[llvm-commits] [llvm-gcc-4.2] r76781 [2/5] - in /llvm-gcc-4.2/trunk: ./ gcc/ gcc/config/ gcc/config/arm/ gcc/config/rs6000/ gcc/cp/ gcc/doc/ gcc/testsuite/g++.apple/ gcc/testsuite/g++.dg/abi/ gcc/testsuite/gcc.apple/ gcc/testsuite/gcc.target/arm/ gcc/testsuite/gcc.target/arm/neon/ gcc/testsuite/obj-c++.dg/ gcc/testsuite/objc.dg/

Bob Wilson bob.wilson at apple.com
Wed Jul 22 13:36:46 PDT 2009
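
For readers skimming the patch: the hunk below adds the generated NEON intrinsics header, defining the 64-bit and 128-bit vector types (int8x8_t, uint16x8_t, float32x4_t, ...) and the arithmetic intrinsics (vadd_*, vaddl_*, vmul_*, vceq_*, ...) that map onto single NEON instructions. The following sketch is illustrative only and is not part of the commit; it shows one typical way the types and the vadd_/vaddl_ intrinsics from this hunk are used, assuming an ARM target compiled with -mfloat-abi=softfp -mfpu=neon. vld1_u8, vdup_n_u8, vst1_u8 and vgetq_lane_u16 are defined in later parts of the same generated header, outside the hunk shown here.

/* Illustrative only -- not part of the committed file. */
#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  uint8_t in[8]  = { 1, 2, 3, 4, 5, 6, 7, 8 };
  uint8_t out[8];

  uint8x8_t  va   = vld1_u8 (in);        /* load eight bytes into a D register      */
  uint8x8_t  vb   = vdup_n_u8 (10);      /* splat the constant 10 across all lanes  */
  uint8x8_t  vsum = vadd_u8 (va, vb);    /* lane-wise 8-bit add (VADD.I8)           */
  uint16x8_t vwid = vaddl_u8 (va, vb);   /* widening add: 8-bit + 8-bit -> 16-bit   */

  vst1_u8 (out, vsum);                   /* store the narrow result back to memory  */

  printf ("%u %u\n", out[0], (unsigned) vgetq_lane_u16 (vwid, 0));
  return 0;
}

The unsigned variants (e.g. vadd_u8 below) cast their arguments to the corresponding signed builtin vector types and pass a final integer code (0 unsigned, 1 signed, 2/3 rounding, 4 polynomial, 5 float) that tells the builtin which instruction variant to emit; the casts back to the unsigned result type are what make the API type-safe at the source level.
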


Added: llvm-gcc-4.2/trunk/gcc/config/arm/arm_neon.h
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/arm_neon.h?rev=76781&view=auto

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/arm_neon.h (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/arm_neon.h Wed Jul 22 15:36:27 2009
@@ -0,0 +1,12180 @@
+/* APPLE LOCAL file v7 support. Merge from Codesourcery */
+/* ARM NEON intrinsics include file. This file is generated automatically
+   using neon-gen.ml.  Please do not edit manually.
+
+   Copyright (C) 2006, 2007 Free Software Foundation, Inc.
+   Contributed by CodeSourcery.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 2, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING.  If not, write to the
+   Free Software Foundation, 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA.  */
+
+/* As a special exception, if you include this header file into source
+   files compiled by GCC, this header file does not by itself cause
+   the resulting executable to be covered by the GNU General Public
+   License.  This exception does not however invalidate any other
+   reasons why the executable file might be covered by the GNU General
+   Public License.  */
+
+#ifndef _GCC_ARM_NEON_H
+#define _GCC_ARM_NEON_H 1
+
+#ifndef __ARM_NEON__
+#error You must enable NEON instructions (e.g. -mfloat-abi=softfp -mfpu=neon) to use arm_neon.h
+#else
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+typedef __builtin_neon_qi int8x8_t	__attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_hi int16x4_t	__attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_si int32x2_t	__attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_di int64x1_t;
+typedef __builtin_neon_sf float32x2_t	__attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_poly8 poly8x8_t	__attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_poly16 poly16x4_t	__attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_uqi uint8x8_t	__attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_uhi uint16x4_t	__attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_usi uint32x2_t	__attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_udi uint64x1_t;
+typedef __builtin_neon_qi int8x16_t	__attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_hi int16x8_t	__attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_si int32x4_t	__attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_di int64x2_t	__attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_sf float32x4_t	__attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_poly8 poly8x16_t	__attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_poly16 poly16x8_t	__attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_uqi uint8x16_t	__attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_uhi uint16x8_t	__attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_usi uint32x4_t	__attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_udi uint64x2_t	__attribute__ ((__vector_size__ (16)));
+
+typedef __builtin_neon_sf float32_t;
+typedef __builtin_neon_poly8 poly8_t;
+typedef __builtin_neon_poly16 poly16_t;
+
+typedef struct int8x8x2_t
+{
+  int8x8_t val[2];
+} int8x8x2_t;
+
+typedef struct int8x16x2_t
+{
+  int8x16_t val[2];
+} int8x16x2_t;
+
+typedef struct int16x4x2_t
+{
+  int16x4_t val[2];
+} int16x4x2_t;
+
+typedef struct int16x8x2_t
+{
+  int16x8_t val[2];
+} int16x8x2_t;
+
+typedef struct int32x2x2_t
+{
+  int32x2_t val[2];
+} int32x2x2_t;
+
+typedef struct int32x4x2_t
+{
+  int32x4_t val[2];
+} int32x4x2_t;
+
+typedef struct int64x1x2_t
+{
+  int64x1_t val[2];
+} int64x1x2_t;
+
+typedef struct int64x2x2_t
+{
+  int64x2_t val[2];
+} int64x2x2_t;
+
+typedef struct uint8x8x2_t
+{
+  uint8x8_t val[2];
+} uint8x8x2_t;
+
+typedef struct uint8x16x2_t
+{
+  uint8x16_t val[2];
+} uint8x16x2_t;
+
+typedef struct uint16x4x2_t
+{
+  uint16x4_t val[2];
+} uint16x4x2_t;
+
+typedef struct uint16x8x2_t
+{
+  uint16x8_t val[2];
+} uint16x8x2_t;
+
+typedef struct uint32x2x2_t
+{
+  uint32x2_t val[2];
+} uint32x2x2_t;
+
+typedef struct uint32x4x2_t
+{
+  uint32x4_t val[2];
+} uint32x4x2_t;
+
+typedef struct uint64x1x2_t
+{
+  uint64x1_t val[2];
+} uint64x1x2_t;
+
+typedef struct uint64x2x2_t
+{
+  uint64x2_t val[2];
+} uint64x2x2_t;
+
+typedef struct float32x2x2_t
+{
+  float32x2_t val[2];
+} float32x2x2_t;
+
+typedef struct float32x4x2_t
+{
+  float32x4_t val[2];
+} float32x4x2_t;
+
+typedef struct poly8x8x2_t
+{
+  poly8x8_t val[2];
+} poly8x8x2_t;
+
+typedef struct poly8x16x2_t
+{
+  poly8x16_t val[2];
+} poly8x16x2_t;
+
+typedef struct poly16x4x2_t
+{
+  poly16x4_t val[2];
+} poly16x4x2_t;
+
+typedef struct poly16x8x2_t
+{
+  poly16x8_t val[2];
+} poly16x8x2_t;
+
+typedef struct int8x8x3_t
+{
+  int8x8_t val[3];
+} int8x8x3_t;
+
+typedef struct int8x16x3_t
+{
+  int8x16_t val[3];
+} int8x16x3_t;
+
+typedef struct int16x4x3_t
+{
+  int16x4_t val[3];
+} int16x4x3_t;
+
+typedef struct int16x8x3_t
+{
+  int16x8_t val[3];
+} int16x8x3_t;
+
+typedef struct int32x2x3_t
+{
+  int32x2_t val[3];
+} int32x2x3_t;
+
+typedef struct int32x4x3_t
+{
+  int32x4_t val[3];
+} int32x4x3_t;
+
+typedef struct int64x1x3_t
+{
+  int64x1_t val[3];
+} int64x1x3_t;
+
+typedef struct int64x2x3_t
+{
+  int64x2_t val[3];
+} int64x2x3_t;
+
+typedef struct uint8x8x3_t
+{
+  uint8x8_t val[3];
+} uint8x8x3_t;
+
+typedef struct uint8x16x3_t
+{
+  uint8x16_t val[3];
+} uint8x16x3_t;
+
+typedef struct uint16x4x3_t
+{
+  uint16x4_t val[3];
+} uint16x4x3_t;
+
+typedef struct uint16x8x3_t
+{
+  uint16x8_t val[3];
+} uint16x8x3_t;
+
+typedef struct uint32x2x3_t
+{
+  uint32x2_t val[3];
+} uint32x2x3_t;
+
+typedef struct uint32x4x3_t
+{
+  uint32x4_t val[3];
+} uint32x4x3_t;
+
+typedef struct uint64x1x3_t
+{
+  uint64x1_t val[3];
+} uint64x1x3_t;
+
+typedef struct uint64x2x3_t
+{
+  uint64x2_t val[3];
+} uint64x2x3_t;
+
+typedef struct float32x2x3_t
+{
+  float32x2_t val[3];
+} float32x2x3_t;
+
+typedef struct float32x4x3_t
+{
+  float32x4_t val[3];
+} float32x4x3_t;
+
+typedef struct poly8x8x3_t
+{
+  poly8x8_t val[3];
+} poly8x8x3_t;
+
+typedef struct poly8x16x3_t
+{
+  poly8x16_t val[3];
+} poly8x16x3_t;
+
+typedef struct poly16x4x3_t
+{
+  poly16x4_t val[3];
+} poly16x4x3_t;
+
+typedef struct poly16x8x3_t
+{
+  poly16x8_t val[3];
+} poly16x8x3_t;
+
+typedef struct int8x8x4_t
+{
+  int8x8_t val[4];
+} int8x8x4_t;
+
+typedef struct int8x16x4_t
+{
+  int8x16_t val[4];
+} int8x16x4_t;
+
+typedef struct int16x4x4_t
+{
+  int16x4_t val[4];
+} int16x4x4_t;
+
+typedef struct int16x8x4_t
+{
+  int16x8_t val[4];
+} int16x8x4_t;
+
+typedef struct int32x2x4_t
+{
+  int32x2_t val[4];
+} int32x2x4_t;
+
+typedef struct int32x4x4_t
+{
+  int32x4_t val[4];
+} int32x4x4_t;
+
+typedef struct int64x1x4_t
+{
+  int64x1_t val[4];
+} int64x1x4_t;
+
+typedef struct int64x2x4_t
+{
+  int64x2_t val[4];
+} int64x2x4_t;
+
+typedef struct uint8x8x4_t
+{
+  uint8x8_t val[4];
+} uint8x8x4_t;
+
+typedef struct uint8x16x4_t
+{
+  uint8x16_t val[4];
+} uint8x16x4_t;
+
+typedef struct uint16x4x4_t
+{
+  uint16x4_t val[4];
+} uint16x4x4_t;
+
+typedef struct uint16x8x4_t
+{
+  uint16x8_t val[4];
+} uint16x8x4_t;
+
+typedef struct uint32x2x4_t
+{
+  uint32x2_t val[4];
+} uint32x2x4_t;
+
+typedef struct uint32x4x4_t
+{
+  uint32x4_t val[4];
+} uint32x4x4_t;
+
+typedef struct uint64x1x4_t
+{
+  uint64x1_t val[4];
+} uint64x1x4_t;
+
+typedef struct uint64x2x4_t
+{
+  uint64x2_t val[4];
+} uint64x2x4_t;
+
+typedef struct float32x2x4_t
+{
+  float32x2_t val[4];
+} float32x2x4_t;
+
+typedef struct float32x4x4_t
+{
+  float32x4_t val[4];
+} float32x4x4_t;
+
+typedef struct poly8x8x4_t
+{
+  poly8x8_t val[4];
+} poly8x8x4_t;
+
+typedef struct poly8x16x4_t
+{
+  poly8x16_t val[4];
+} poly8x16x4_t;
+
+typedef struct poly16x4x4_t
+{
+  poly16x4_t val[4];
+} poly16x4x4_t;
+
+typedef struct poly16x8x4_t
+{
+  poly16x8_t val[4];
+} poly16x8x4_t;
+
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vadd_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vaddv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vadd_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vaddv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vadd_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vaddv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vadd_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vadddi (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vadd_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (float32x2_t)__builtin_neon_vaddv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vadd_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vadd_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vadd_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vadd_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x1_t)__builtin_neon_vadddi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vaddq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vaddv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vaddq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vaddv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vaddq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vaddv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vaddq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vaddv2di (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vaddq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (float32x4_t)__builtin_neon_vaddv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vaddq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vaddq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vaddq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vaddv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vaddq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vaddv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vaddl_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vaddlv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vaddl_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vaddlv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vaddl_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vaddlv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vaddl_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vaddlv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vaddl_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vaddlv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vaddl_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vaddlv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vaddw_s8 (int16x8_t __a, int8x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vaddwv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vaddw_s16 (int32x4_t __a, int16x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vaddwv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vaddw_s32 (int64x2_t __a, int32x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vaddwv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vaddw_u8 (uint16x8_t __a, uint8x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vaddwv8qi ((int16x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vaddw_u16 (uint32x4_t __a, uint16x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vaddwv4hi ((int32x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vaddw_u32 (uint64x2_t __a, uint32x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vaddwv2si ((int64x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vhadd_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vhaddv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vhadd_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vhaddv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vhadd_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vhaddv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vhadd_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vhaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vhadd_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vhaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vhadd_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vhaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vhaddq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vhaddv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vhaddq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vhaddv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vhaddq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vhaddv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vhaddq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vhaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vhaddq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vhaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vhaddq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vhaddv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrhadd_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vhaddv8qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrhadd_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vhaddv4hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrhadd_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vhaddv2si (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrhadd_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vhaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrhadd_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vhaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrhadd_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vhaddv2si ((int32x2_t) __a, (int32x2_t) __b, 2);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrhaddq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vhaddv16qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrhaddq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vhaddv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vrhaddq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vhaddv4si (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrhaddq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vhaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 2);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrhaddq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vhaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 2);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrhaddq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vhaddv4si ((int32x4_t) __a, (int32x4_t) __b, 2);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqadd_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vqaddv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqadd_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vqaddv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqadd_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vqaddv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vqadd_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vqadddi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqadd_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vqaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqadd_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vqaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqadd_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vqaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vqadd_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x1_t)__builtin_neon_vqadddi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqaddq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vqaddv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqaddq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vqaddv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqaddq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vqaddv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqaddq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vqaddv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqaddq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vqaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vqaddq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vqaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vqaddq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vqaddv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vqaddq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vqaddv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vaddhn_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vaddhnv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vaddhn_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vaddhnv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vaddhn_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vaddhnv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vaddhn_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vaddhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vaddhn_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vaddhnv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vaddhn_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vaddhnv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vraddhn_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vaddhnv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vraddhn_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vaddhnv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vraddhn_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vaddhnv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vraddhn_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vaddhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vraddhn_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vaddhnv4si ((int32x4_t) __a, (int32x4_t) __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vraddhn_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vaddhnv2di ((int64x2_t) __a, (int64x2_t) __b, 2);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmul_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vmulv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vmulv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vmulv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (float32x2_t)__builtin_neon_vmulv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmul_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vmulv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vmulv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vmulv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vmul_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return (poly8x8_t)__builtin_neon_vmulv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vmulq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vmulv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vmulv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vmulv4si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (float32x4_t)__builtin_neon_vmulv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vmulq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vmulv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vmulv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vmulv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vmulq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  return (poly8x16_t)__builtin_neon_vmulv16qi ((int8x16_t) __a, (int8x16_t) __b, 4);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqdmulh_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vqdmulhv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqdmulh_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vqdmulhv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqdmulhq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vqdmulhv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmulhq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vqdmulhv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrdmulh_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vqdmulhv4hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrdmulh_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vqdmulhv2si (__a, __b, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqrdmulhq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vqdmulhv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vqdmulhv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vmullv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vmullv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vmullv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vmullv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vmullv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vmullv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return (poly16x8_t)__builtin_neon_vmullv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmull_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vqdmullv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmull_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vqdmullv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+  return (int8x8_t)__builtin_neon_vmlav8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+  return (int16x4_t)__builtin_neon_vmlav4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+  return (int32x2_t)__builtin_neon_vmlav2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmla_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+  return (float32x2_t)__builtin_neon_vmlav2sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+  return (uint8x8_t)__builtin_neon_vmlav8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+  return (uint16x4_t)__builtin_neon_vmlav4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+  return (uint32x2_t)__builtin_neon_vmlav2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
+{
+  return (int8x16_t)__builtin_neon_vmlav16qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+  return (int16x8_t)__builtin_neon_vmlav8hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+  return (int32x4_t)__builtin_neon_vmlav4si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+  return (float32x4_t)__builtin_neon_vmlav4sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
+{
+  return (uint8x16_t)__builtin_neon_vmlav16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
+{
+  return (uint16x8_t)__builtin_neon_vmlav8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
+{
+  return (uint32x4_t)__builtin_neon_vmlav4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+  return (int16x8_t)__builtin_neon_vmlalv8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+  return (int32x4_t)__builtin_neon_vmlalv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+  return (int64x2_t)__builtin_neon_vmlalv2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+  return (uint16x8_t)__builtin_neon_vmlalv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+  return (uint32x4_t)__builtin_neon_vmlalv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmlal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+  return (uint64x2_t)__builtin_neon_vmlalv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+  return (int32x4_t)__builtin_neon_vqdmlalv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+  return (int64x2_t)__builtin_neon_vqdmlalv2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmls_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+  return (int8x8_t)__builtin_neon_vmlsv8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmls_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+  return (int16x4_t)__builtin_neon_vmlsv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmls_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+  return (int32x2_t)__builtin_neon_vmlsv2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmls_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+  return (float32x2_t)__builtin_neon_vmlsv2sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmls_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+  return (uint8x8_t)__builtin_neon_vmlsv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmls_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+  return (uint16x4_t)__builtin_neon_vmlsv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmls_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+  return (uint32x2_t)__builtin_neon_vmlsv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vmlsq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
+{
+  return (int8x16_t)__builtin_neon_vmlsv16qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlsq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+  return (int16x8_t)__builtin_neon_vmlsv8hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+  return (int32x4_t)__builtin_neon_vmlsv4si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+  return (float32x4_t)__builtin_neon_vmlsv4sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vmlsq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
+{
+  return (uint8x16_t)__builtin_neon_vmlsv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlsq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
+{
+  return (uint16x8_t)__builtin_neon_vmlsv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
+{
+  return (uint32x4_t)__builtin_neon_vmlsv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+  return (int16x8_t)__builtin_neon_vmlslv8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+  return (int32x4_t)__builtin_neon_vmlslv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+  return (int64x2_t)__builtin_neon_vmlslv2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+  return (uint16x8_t)__builtin_neon_vmlslv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+  return (uint32x4_t)__builtin_neon_vmlslv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+  return (uint64x2_t)__builtin_neon_vmlslv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+  return (int32x4_t)__builtin_neon_vqdmlslv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+  return (int64x2_t)__builtin_neon_vqdmlslv2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vsub_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vsubv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vsub_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vsubv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vsub_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vsubv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vsub_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vsubdi (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vsub_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (float32x2_t)__builtin_neon_vsubv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vsub_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vsubv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vsub_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vsubv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vsub_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vsubv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vsub_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x1_t)__builtin_neon_vsubdi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vsubq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vsubv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsubq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vsubv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsubq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vsubv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsubq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vsubv2di (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vsubq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (float32x4_t)__builtin_neon_vsubv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vsubq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vsubv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsubq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vsubv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsubq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vsubv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsubq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vsubv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsubl_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vsublv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsubl_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vsublv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsubl_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vsublv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsubl_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vsublv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsubl_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vsublv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsubl_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vsublv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsubw_s8 (int16x8_t __a, int8x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vsubwv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsubw_s16 (int32x4_t __a, int16x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vsubwv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsubw_s32 (int64x2_t __a, int32x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vsubwv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsubw_u8 (uint16x8_t __a, uint8x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vsubwv8qi ((int16x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsubw_u16 (uint32x4_t __a, uint16x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vsubwv4hi ((int32x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsubw_u32 (uint64x2_t __a, uint32x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vsubwv2si ((int64x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vhsub_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vhsubv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vhsub_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vhsubv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vhsub_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vhsubv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vhsub_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vhsubv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vhsub_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vhsubv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vhsub_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vhsubv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vhsubq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vhsubv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vhsubq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vhsubv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vhsubq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vhsubv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vhsubq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vhsubv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vhsubq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vhsubv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vhsubq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vhsubv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqsub_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vqsubv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqsub_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vqsubv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqsub_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vqsubv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vqsub_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vqsubdi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqsub_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vqsubv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqsub_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vqsubv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqsub_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vqsubv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vqsub_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x1_t)__builtin_neon_vqsubdi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqsubq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vqsubv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqsubq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vqsubv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqsubq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vqsubv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqsubq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vqsubv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqsubq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vqsubv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vqsubq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vqsubv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vqsubq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vqsubv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vqsubq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vqsubv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vsubhn_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vsubhnv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vsubhn_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vsubhnv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vsubhn_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vsubhnv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vsubhnv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vsubhnv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrsubhn_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vsubhnv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrsubhn_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vsubhnv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrsubhn_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vsubhnv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vsubhnv4si ((int32x4_t) __a, (int32x4_t) __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vsubhnv2di ((int64x2_t) __a, (int64x2_t) __b, 2);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vceq_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vceqv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vceq_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vceqv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vceq_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vceqv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vceq_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vceqv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vceq_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vceq_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vceqv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vceq_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vceqv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vceq_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vceqq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vceqv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vceqq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vceqv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vceqq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vceqv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vceqq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vceqv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vceqv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vceqv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b, 4);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcge_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vcgev8qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcge_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcgev4hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcge_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcgev2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcge_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcgev2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcge_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vcgev8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcge_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcgev4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcge_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcgev2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgeq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vcgev16qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgeq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcgev8hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgeq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcgev4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgeq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcgev4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vcgev16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcgev8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcgev4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcle_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vcgev8qi (__b, __a, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcle_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcgev4hi (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcle_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcgev2si (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcle_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcgev2sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcle_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vcgev8qi ((int8x8_t) __b, (int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcle_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcgev4hi ((int16x4_t) __b, (int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcle_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcgev2si ((int32x2_t) __b, (int32x2_t) __a, 0);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcleq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vcgev16qi (__b, __a, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcleq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcgev8hi (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcleq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcgev4si (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcleq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcgev4sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vcgev16qi ((int8x16_t) __b, (int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcgev8hi ((int16x8_t) __b, (int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcgev4si ((int32x4_t) __b, (int32x4_t) __a, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcgt_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vcgtv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcgt_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcgtv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcgt_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcgtv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcgt_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcgtv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vcgtv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcgtv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcgtv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgtq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vcgtv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgtq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcgtv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgtq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcgtv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgtq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcgtv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vcgtv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcgtv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcgtv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vclt_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vcgtv8qi (__b, __a, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vclt_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcgtv4hi (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclt_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcgtv2si (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclt_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcgtv2sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vclt_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vcgtv8qi ((int8x8_t) __b, (int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vclt_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcgtv4hi ((int16x4_t) __b, (int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclt_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcgtv2si ((int32x2_t) __b, (int32x2_t) __a, 0);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcltq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vcgtv16qi (__b, __a, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcltq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcgtv8hi (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcltq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcgtv4si (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcltq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcgtv4sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vcgtv16qi ((int8x16_t) __b, (int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcgtv8hi ((int16x8_t) __b, (int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcgtv4si ((int32x4_t) __b, (int32x4_t) __a, 0);
+}
+
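+/* There are no separate "less than" builtins: the vcle and vclt wrappers
+   above simply swap the operands of the vcge and vcgt builtins, so for
+   example vclt_s8 (__a, __b) yields the same mask as vcgt_s8 (__b, __a).  */
+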
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcage_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcagev2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcageq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcagev4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcale_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcagev2sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcaleq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcagev4sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcagt_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcagtv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcagtq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcagtv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcalt_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vcagtv2sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcaltq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcagtv4sf (__b, __a, 5);
+}
+
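+/* The vcage and vcagt wrappers compare absolute values per lane (VACGE and
+   VACGT), and vcale and vcalt are again obtained by swapping the operands
+   rather than through separate builtins.  */
+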
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtst_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vtstv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vtst_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vtstv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vtst_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vtstv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtst_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vtst_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vtst_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vtstv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtst_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vtstq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vtstv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vtstq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vtstv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vtstq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vtstv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vtstq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vtstq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vtstq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vtstv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vtstq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b, 4);
+}
+
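+/* The vtst wrappers correspond to VTST: each result lane is all ones when
+   (__a & __b) is non-zero in that lane and zero otherwise, hence the
+   unsigned mask result types.  */
+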
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vabd_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vabdv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vabd_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vabdv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vabd_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vabdv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vabd_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (float32x2_t)__builtin_neon_vabdv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vabd_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vabdv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vabd_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vabdv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vabd_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vabdv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vabdq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vabdv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vabdq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vabdv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vabdq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vabdv4si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vabdq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (float32x4_t)__builtin_neon_vabdv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vabdq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vabdv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vabdq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vabdv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vabdq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vabdv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vabdl_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vabdlv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vabdl_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vabdlv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vabdl_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vabdlv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vabdl_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vabdlv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vabdl_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vabdlv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vabdl_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vabdlv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+  return (int8x8_t)__builtin_neon_vabav8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+  return (int16x4_t)__builtin_neon_vabav4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+  return (int32x2_t)__builtin_neon_vabav2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+  return (uint8x8_t)__builtin_neon_vabav8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vaba_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+  return (uint16x4_t)__builtin_neon_vabav4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+  return (uint32x2_t)__builtin_neon_vabav2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
+{
+  return (int8x16_t)__builtin_neon_vabav16qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+  return (int16x8_t)__builtin_neon_vabav8hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+  return (int32x4_t)__builtin_neon_vabav4si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
+{
+  return (uint8x16_t)__builtin_neon_vabav16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vabaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
+{
+  return (uint16x8_t)__builtin_neon_vabav8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
+{
+  return (uint32x4_t)__builtin_neon_vabav4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vabal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+  return (int16x8_t)__builtin_neon_vabalv8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vabal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+  return (int32x4_t)__builtin_neon_vabalv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vabal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+  return (int64x2_t)__builtin_neon_vabalv2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vabal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+  return (uint16x8_t)__builtin_neon_vabalv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vabal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+  return (uint32x4_t)__builtin_neon_vabalv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vabal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+  return (uint64x2_t)__builtin_neon_vabalv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
+}
+
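+/* In the accumulating forms the first operand is the accumulator: vaba adds
+   the lane-wise absolute difference of __b and __c to __a, and vabal does
+   the same into the double-width accumulator shown in the signatures.  */
+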
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmax_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vmaxv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmax_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vmaxv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmax_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vmaxv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmax_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (float32x2_t)__builtin_neon_vmaxv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmax_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vmaxv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmax_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vmaxv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmax_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vmaxv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vmaxq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vmaxv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmaxq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vmaxv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmaxq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vmaxv4si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmaxq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (float32x4_t)__builtin_neon_vmaxv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vmaxq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vmaxv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmaxq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vmaxv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmaxq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vmaxv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmin_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vminv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmin_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vminv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmin_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vminv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmin_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (float32x2_t)__builtin_neon_vminv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmin_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vminv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmin_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vminv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmin_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vminv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vminq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vminv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vminq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vminv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vminq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vminv4si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vminq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (float32x4_t)__builtin_neon_vminv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vminq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vminv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vminq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vminv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vminq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vminv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vpadd_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vpaddv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vpadd_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vpaddv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vpadd_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vpaddv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vpadd_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (float32x2_t)__builtin_neon_vpaddv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vpadd_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vpaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vpadd_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vpaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vpaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vpaddl_s8 (int8x8_t __a)
+{
+  return (int16x4_t)__builtin_neon_vpaddlv8qi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vpaddl_s16 (int16x4_t __a)
+{
+  return (int32x2_t)__builtin_neon_vpaddlv4hi (__a, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vpaddl_s32 (int32x2_t __a)
+{
+  return (int64x1_t)__builtin_neon_vpaddlv2si (__a, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vpaddl_u8 (uint8x8_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vpaddlv8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vpaddl_u16 (uint16x4_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vpaddlv4hi ((int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vpaddl_u32 (uint32x2_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vpaddlv2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vpaddlq_s8 (int8x16_t __a)
+{
+  return (int16x8_t)__builtin_neon_vpaddlv16qi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vpaddlq_s16 (int16x8_t __a)
+{
+  return (int32x4_t)__builtin_neon_vpaddlv8hi (__a, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vpaddlq_s32 (int32x4_t __a)
+{
+  return (int64x2_t)__builtin_neon_vpaddlv4si (__a, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vpaddlq_u8 (uint8x16_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vpaddlv16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vpaddlq_u16 (uint16x8_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vpaddlv8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vpaddlq_u32 (uint32x4_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vpaddlv4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vpadal_s8 (int16x4_t __a, int8x8_t __b)
+{
+  return (int16x4_t)__builtin_neon_vpadalv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vpadal_s16 (int32x2_t __a, int16x4_t __b)
+{
+  return (int32x2_t)__builtin_neon_vpadalv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vpadal_s32 (int64x1_t __a, int32x2_t __b)
+{
+  return (int64x1_t)__builtin_neon_vpadalv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vpadal_u8 (uint16x4_t __a, uint8x8_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vpadalv8qi ((int16x4_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vpadal_u16 (uint32x2_t __a, uint16x4_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vpadalv4hi ((int32x2_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vpadal_u32 (uint64x1_t __a, uint32x2_t __b)
+{
+  return (uint64x1_t)__builtin_neon_vpadalv2si ((int64x1_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vpadalq_s8 (int16x8_t __a, int8x16_t __b)
+{
+  return (int16x8_t)__builtin_neon_vpadalv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vpadalq_s16 (int32x4_t __a, int16x8_t __b)
+{
+  return (int32x4_t)__builtin_neon_vpadalv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vpadalq_s32 (int64x2_t __a, int32x4_t __b)
+{
+  return (int64x2_t)__builtin_neon_vpadalv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vpadalq_u8 (uint16x8_t __a, uint8x16_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vpadalv16qi ((int16x8_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vpadalq_u16 (uint32x4_t __a, uint16x8_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vpadalv8hi ((int32x4_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vpadalq_u32 (uint64x2_t __a, uint32x4_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vpadalv4si ((int64x2_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vpmax_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vpmaxv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vpmax_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vpmaxv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vpmax_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vpmaxv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vpmax_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (float32x2_t)__builtin_neon_vpmaxv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vpmax_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vpmaxv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vpmax_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vpmaxv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vpmax_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vpmaxv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vpmin_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vpminv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vpmin_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vpminv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vpmin_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vpminv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vpmin_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (float32x2_t)__builtin_neon_vpminv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vpmin_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vpminv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vpmin_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vpminv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vpmin_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vpminv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrecps_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (float32x2_t)__builtin_neon_vrecpsv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrecpsq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (float32x4_t)__builtin_neon_vrecpsv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrsqrts_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (float32x2_t)__builtin_neon_vrsqrtsv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (float32x4_t)__builtin_neon_vrsqrtsv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vshl_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vshlv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vshl_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vshlv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vshl_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vshlv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vshl_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vshldi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vshl_u8 (uint8x8_t __a, int8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vshlv8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vshl_u16 (uint16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vshlv4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vshl_u32 (uint32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vshlv2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vshl_u64 (uint64x1_t __a, int64x1_t __b)
+{
+  return (uint64x1_t)__builtin_neon_vshldi ((int64x1_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vshlq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vshlv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vshlq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vshlv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vshlq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vshlv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vshlq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vshlv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vshlq_u8 (uint8x16_t __a, int8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vshlv16qi ((int8x16_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vshlq_u16 (uint16x8_t __a, int16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vshlv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vshlq_u32 (uint32x4_t __a, int32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vshlv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vshlq_u64 (uint64x2_t __a, int64x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vshlv2di ((int64x2_t) __a, __b, 0);
+}
+
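+/* For the register shifts the count vector __b is a signed type even in the
+   unsigned variants: each lane of __a is shifted by the corresponding
+   element of __b, and (as with VSHL) a negative count shifts right.  */
+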
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrshl_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vshlv8qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrshl_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vshlv4hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrshl_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vshlv2si (__a, __b, 3);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vrshl_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vshldi (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrshl_u8 (uint8x8_t __a, int8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vshlv8qi ((int8x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrshl_u16 (uint16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vshlv4hi ((int16x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrshl_u32 (uint32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vshlv2si ((int32x2_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vrshl_u64 (uint64x1_t __a, int64x1_t __b)
+{
+  return (uint64x1_t)__builtin_neon_vshldi ((int64x1_t) __a, __b, 2);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrshlq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vshlv16qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrshlq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vshlv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vrshlq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vshlv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vrshlq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vshlv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrshlq_u8 (uint8x16_t __a, int8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vshlv16qi ((int8x16_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrshlq_u16 (uint16x8_t __a, int16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vshlv8hi ((int16x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrshlq_u32 (uint32x4_t __a, int32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vshlv4si ((int32x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vrshlq_u64 (uint64x2_t __a, int64x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vshlv2di ((int64x2_t) __a, __b, 2);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqshl_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vqshlv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqshl_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vqshlv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqshl_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vqshlv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vqshl_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vqshldi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqshl_u8 (uint8x8_t __a, int8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vqshlv8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqshl_u16 (uint16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vqshlv4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqshl_u32 (uint32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vqshlv2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vqshl_u64 (uint64x1_t __a, int64x1_t __b)
+{
+  return (uint64x1_t)__builtin_neon_vqshldi ((int64x1_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqshlq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vqshlv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqshlq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vqshlv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqshlq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vqshlv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqshlq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vqshlv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqshlq_u8 (uint8x16_t __a, int8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vqshlv16qi ((int8x16_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vqshlq_u16 (uint16x8_t __a, int16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vqshlv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vqshlq_u32 (uint32x4_t __a, int32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vqshlv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vqshlq_u64 (uint64x2_t __a, int64x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vqshlv2di ((int64x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqrshl_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vqshlv8qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrshl_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vqshlv4hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrshl_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vqshlv2si (__a, __b, 3);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vqrshl_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vqshldi (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqrshl_u8 (uint8x8_t __a, int8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vqshlv8qi ((int8x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqrshl_u16 (uint16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vqshlv4hi ((int16x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqrshl_u32 (uint32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vqshlv2si ((int32x2_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vqrshl_u64 (uint64x1_t __a, int64x1_t __b)
+{
+  return (uint64x1_t)__builtin_neon_vqshldi ((int64x1_t) __a, __b, 2);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqrshlq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vqshlv16qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqrshlq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vqshlv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqrshlq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vqshlv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqrshlq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vqshlv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqrshlq_u8 (uint8x16_t __a, int8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vqshlv16qi ((int8x16_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vqrshlq_u16 (uint16x8_t __a, int16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vqshlv8hi ((int16x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vqrshlq_u32 (uint32x4_t __a, int32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vqshlv4si ((int32x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vqrshlq_u64 (uint64x2_t __a, int64x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vqshlv2di ((int64x2_t) __a, __b, 2);
+}
+
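+/* The rounding register shifts reuse the plain builtins: the vrshl wrappers
+   call the same __builtin_neon_vshl builtins as vshl, and vqrshl reuses the
+   vqshl builtins, with rounding selected by the trailing mode argument
+   (2 or 3) rather than by a distinct builtin.  */
+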
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vshr_n_s8 (int8x8_t __a, const int __b)
+{
+  return (int8x8_t)__builtin_neon_vshr_nv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vshr_n_s16 (int16x4_t __a, const int __b)
+{
+  return (int16x4_t)__builtin_neon_vshr_nv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vshr_n_s32 (int32x2_t __a, const int __b)
+{
+  return (int32x2_t)__builtin_neon_vshr_nv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vshr_n_s64 (int64x1_t __a, const int __b)
+{
+  return (int64x1_t)__builtin_neon_vshr_ndi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vshr_n_u8 (uint8x8_t __a, const int __b)
+{
+  return (uint8x8_t)__builtin_neon_vshr_nv8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vshr_n_u16 (uint16x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vshr_nv4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vshr_n_u32 (uint32x2_t __a, const int __b)
+{
+  return (uint32x2_t)__builtin_neon_vshr_nv2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vshr_n_u64 (uint64x1_t __a, const int __b)
+{
+  return (uint64x1_t)__builtin_neon_vshr_ndi ((int64x1_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vshrq_n_s8 (int8x16_t __a, const int __b)
+{
+  return (int8x16_t)__builtin_neon_vshr_nv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vshrq_n_s16 (int16x8_t __a, const int __b)
+{
+  return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vshrq_n_s32 (int32x4_t __a, const int __b)
+{
+  return (int32x4_t)__builtin_neon_vshr_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vshrq_n_s64 (int64x2_t __a, const int __b)
+{
+  return (int64x2_t)__builtin_neon_vshr_nv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vshrq_n_u8 (uint8x16_t __a, const int __b)
+{
+  return (uint8x16_t)__builtin_neon_vshr_nv16qi ((int8x16_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vshrq_n_u16 (uint16x8_t __a, const int __b)
+{
+  return (uint16x8_t)__builtin_neon_vshr_nv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vshrq_n_u32 (uint32x4_t __a, const int __b)
+{
+  return (uint32x4_t)__builtin_neon_vshr_nv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vshrq_n_u64 (uint64x2_t __a, const int __b)
+{
+  return (uint64x2_t)__builtin_neon_vshr_nv2di ((int64x2_t) __a, __b, 0);
+}
+
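+/* The _n shift wrappers take the count as a "const int"; as with the
+   underlying immediate-form instructions it is presumably required to be a
+   constant expression, and for these right shifts the valid range is 1 up
+   to the element width in bits.  */
+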
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrshr_n_s8 (int8x8_t __a, const int __b)
+{
+  return (int8x8_t)__builtin_neon_vshr_nv8qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrshr_n_s16 (int16x4_t __a, const int __b)
+{
+  return (int16x4_t)__builtin_neon_vshr_nv4hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrshr_n_s32 (int32x2_t __a, const int __b)
+{
+  return (int32x2_t)__builtin_neon_vshr_nv2si (__a, __b, 3);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vrshr_n_s64 (int64x1_t __a, const int __b)
+{
+  return (int64x1_t)__builtin_neon_vshr_ndi (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrshr_n_u8 (uint8x8_t __a, const int __b)
+{
+  return (uint8x8_t)__builtin_neon_vshr_nv8qi ((int8x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrshr_n_u16 (uint16x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vshr_nv4hi ((int16x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrshr_n_u32 (uint32x2_t __a, const int __b)
+{
+  return (uint32x2_t)__builtin_neon_vshr_nv2si ((int32x2_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vrshr_n_u64 (uint64x1_t __a, const int __b)
+{
+  return (uint64x1_t)__builtin_neon_vshr_ndi ((int64x1_t) __a, __b, 2);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrshrq_n_s8 (int8x16_t __a, const int __b)
+{
+  return (int8x16_t)__builtin_neon_vshr_nv16qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrshrq_n_s16 (int16x8_t __a, const int __b)
+{
+  return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vrshrq_n_s32 (int32x4_t __a, const int __b)
+{
+  return (int32x4_t)__builtin_neon_vshr_nv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vrshrq_n_s64 (int64x2_t __a, const int __b)
+{
+  return (int64x2_t)__builtin_neon_vshr_nv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrshrq_n_u8 (uint8x16_t __a, const int __b)
+{
+  return (uint8x16_t)__builtin_neon_vshr_nv16qi ((int8x16_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrshrq_n_u16 (uint16x8_t __a, const int __b)
+{
+  return (uint16x8_t)__builtin_neon_vshr_nv8hi ((int16x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrshrq_n_u32 (uint32x4_t __a, const int __b)
+{
+  return (uint32x4_t)__builtin_neon_vshr_nv4si ((int32x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vrshrq_n_u64 (uint64x2_t __a, const int __b)
+{
+  return (uint64x2_t)__builtin_neon_vshr_nv2di ((int64x2_t) __a, __b, 2);
+}
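+
+/* Usage sketch (illustrative): the vrshr_n_* forms are the rounding
+   variants of the shifts above; half of the discarded value is added
+   before shifting.  For some vector v of type uint8x8_t:
+     uint8x8_t r = vrshr_n_u8 (v, 3);    each lane becomes (v[i] + 4) >> 3  */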
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vshrn_n_s16 (int16x8_t __a, const int __b)
+{
+  return (int8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vshrn_n_s32 (int32x4_t __a, const int __b)
+{
+  return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vshrn_n_s64 (int64x2_t __a, const int __b)
+{
+  return (int32x2_t)__builtin_neon_vshrn_nv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vshrn_n_u16 (uint16x8_t __a, const int __b)
+{
+  return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vshrn_n_u32 (uint32x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vshrn_nv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vshrn_n_u64 (uint64x2_t __a, const int __b)
+{
+  return (uint32x2_t)__builtin_neon_vshrn_nv2di ((int64x2_t) __a, __b, 0);
+}
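+
+/* Usage sketch (illustrative): vshrn_n_* shifts each lane of a 128-bit
+   vector right by a constant and narrows the result to half the element
+   width, discarding the high bits.  For some vector x of type int16x8_t:
+     int8x8_t high_bytes = vshrn_n_s16 (x, 8);  */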
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrshrn_n_s16 (int16x8_t __a, const int __b)
+{
+  return (int8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrshrn_n_s32 (int32x4_t __a, const int __b)
+{
+  return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrshrn_n_s64 (int64x2_t __a, const int __b)
+{
+  return (int32x2_t)__builtin_neon_vshrn_nv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrshrn_n_u16 (uint16x8_t __a, const int __b)
+{
+  return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrshrn_n_u32 (uint32x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vshrn_nv4si ((int32x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrshrn_n_u64 (uint64x2_t __a, const int __b)
+{
+  return (uint32x2_t)__builtin_neon_vshrn_nv2di ((int64x2_t) __a, __b, 2);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqshrn_n_s16 (int16x8_t __a, const int __b)
+{
+  return (int8x8_t)__builtin_neon_vqshrn_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqshrn_n_s32 (int32x4_t __a, const int __b)
+{
+  return (int16x4_t)__builtin_neon_vqshrn_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqshrn_n_s64 (int64x2_t __a, const int __b)
+{
+  return (int32x2_t)__builtin_neon_vqshrn_nv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqshrn_n_u16 (uint16x8_t __a, const int __b)
+{
+  return (uint8x8_t)__builtin_neon_vqshrn_nv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqshrn_n_u32 (uint32x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vqshrn_nv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqshrn_n_u64 (uint64x2_t __a, const int __b)
+{
+  return (uint32x2_t)__builtin_neon_vqshrn_nv2di ((int64x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqrshrn_n_s16 (int16x8_t __a, const int __b)
+{
+  return (int8x8_t)__builtin_neon_vqshrn_nv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrshrn_n_s32 (int32x4_t __a, const int __b)
+{
+  return (int16x4_t)__builtin_neon_vqshrn_nv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrshrn_n_s64 (int64x2_t __a, const int __b)
+{
+  return (int32x2_t)__builtin_neon_vqshrn_nv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqrshrn_n_u16 (uint16x8_t __a, const int __b)
+{
+  return (uint8x8_t)__builtin_neon_vqshrn_nv8hi ((int16x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqrshrn_n_u32 (uint32x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vqshrn_nv4si ((int32x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqrshrn_n_u64 (uint64x2_t __a, const int __b)
+{
+  return (uint32x2_t)__builtin_neon_vqshrn_nv2di ((int64x2_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqshrun_n_s16 (int16x8_t __a, const int __b)
+{
+  return (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqshrun_n_s32 (int32x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vqshrun_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqshrun_n_s64 (int64x2_t __a, const int __b)
+{
+  return (uint32x2_t)__builtin_neon_vqshrun_nv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqrshrun_n_s16 (int16x8_t __a, const int __b)
+{
+  return (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqrshrun_n_s32 (int32x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vqshrun_nv4si (__a, __b, 3);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqrshrun_n_s64 (int64x2_t __a, const int __b)
+{
+  return (uint32x2_t)__builtin_neon_vqshrun_nv2di (__a, __b, 3);
+}
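+
+/* Usage sketch (illustrative): the remaining narrowing shifts combine
+   rounding (vrshrn), saturation (vqshrn, vqrshrn) and signed-to-unsigned
+   saturation (vqshrun, vqrshrun).  A common fixed-point pattern, for some
+   accumulator acc of type int16x8_t:
+     uint8x8_t pixels = vqrshrun_n_s16 (acc, 6);
+   which rounds, shifts right by 6 and clamps each lane to 0..255.  */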
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vshl_n_s8 (int8x8_t __a, const int __b)
+{
+  return (int8x8_t)__builtin_neon_vshl_nv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vshl_n_s16 (int16x4_t __a, const int __b)
+{
+  return (int16x4_t)__builtin_neon_vshl_nv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vshl_n_s32 (int32x2_t __a, const int __b)
+{
+  return (int32x2_t)__builtin_neon_vshl_nv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vshl_n_s64 (int64x1_t __a, const int __b)
+{
+  return (int64x1_t)__builtin_neon_vshl_ndi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vshl_n_u8 (uint8x8_t __a, const int __b)
+{
+  return (uint8x8_t)__builtin_neon_vshl_nv8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vshl_n_u16 (uint16x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vshl_nv4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vshl_n_u32 (uint32x2_t __a, const int __b)
+{
+  return (uint32x2_t)__builtin_neon_vshl_nv2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vshl_n_u64 (uint64x1_t __a, const int __b)
+{
+  return (uint64x1_t)__builtin_neon_vshl_ndi ((int64x1_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vshlq_n_s8 (int8x16_t __a, const int __b)
+{
+  return (int8x16_t)__builtin_neon_vshl_nv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vshlq_n_s16 (int16x8_t __a, const int __b)
+{
+  return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vshlq_n_s32 (int32x4_t __a, const int __b)
+{
+  return (int32x4_t)__builtin_neon_vshl_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vshlq_n_s64 (int64x2_t __a, const int __b)
+{
+  return (int64x2_t)__builtin_neon_vshl_nv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vshlq_n_u8 (uint8x16_t __a, const int __b)
+{
+  return (uint8x16_t)__builtin_neon_vshl_nv16qi ((int8x16_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vshlq_n_u16 (uint16x8_t __a, const int __b)
+{
+  return (uint16x8_t)__builtin_neon_vshl_nv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vshlq_n_u32 (uint32x4_t __a, const int __b)
+{
+  return (uint32x4_t)__builtin_neon_vshl_nv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vshlq_n_u64 (uint64x2_t __a, const int __b)
+{
+  return (uint64x2_t)__builtin_neon_vshl_nv2di ((int64x2_t) __a, __b, 0);
+}
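+
+/* Usage sketch (illustrative): vshl_n_* and vshlq_n_* shift every lane
+   left by a compile-time constant in the range 0..element-bits-1.  For
+   some vector v of type int32x2_t:
+     int32x2_t scaled = vshl_n_s32 (v, 4);  */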
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqshl_n_s8 (int8x8_t __a, const int __b)
+{
+  return (int8x8_t)__builtin_neon_vqshl_nv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqshl_n_s16 (int16x4_t __a, const int __b)
+{
+  return (int16x4_t)__builtin_neon_vqshl_nv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqshl_n_s32 (int32x2_t __a, const int __b)
+{
+  return (int32x2_t)__builtin_neon_vqshl_nv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vqshl_n_s64 (int64x1_t __a, const int __b)
+{
+  return (int64x1_t)__builtin_neon_vqshl_ndi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqshl_n_u8 (uint8x8_t __a, const int __b)
+{
+  return (uint8x8_t)__builtin_neon_vqshl_nv8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqshl_n_u16 (uint16x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vqshl_nv4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqshl_n_u32 (uint32x2_t __a, const int __b)
+{
+  return (uint32x2_t)__builtin_neon_vqshl_nv2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vqshl_n_u64 (uint64x1_t __a, const int __b)
+{
+  return (uint64x1_t)__builtin_neon_vqshl_ndi ((int64x1_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqshlq_n_s8 (int8x16_t __a, const int __b)
+{
+  return (int8x16_t)__builtin_neon_vqshl_nv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqshlq_n_s16 (int16x8_t __a, const int __b)
+{
+  return (int16x8_t)__builtin_neon_vqshl_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqshlq_n_s32 (int32x4_t __a, const int __b)
+{
+  return (int32x4_t)__builtin_neon_vqshl_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqshlq_n_s64 (int64x2_t __a, const int __b)
+{
+  return (int64x2_t)__builtin_neon_vqshl_nv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqshlq_n_u8 (uint8x16_t __a, const int __b)
+{
+  return (uint8x16_t)__builtin_neon_vqshl_nv16qi ((int8x16_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vqshlq_n_u16 (uint16x8_t __a, const int __b)
+{
+  return (uint16x8_t)__builtin_neon_vqshl_nv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vqshlq_n_u32 (uint32x4_t __a, const int __b)
+{
+  return (uint32x4_t)__builtin_neon_vqshl_nv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vqshlq_n_u64 (uint64x2_t __a, const int __b)
+{
+  return (uint64x2_t)__builtin_neon_vqshl_nv2di ((int64x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqshlu_n_s8 (int8x8_t __a, const int __b)
+{
+  return (uint8x8_t)__builtin_neon_vqshlu_nv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqshlu_n_s16 (int16x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vqshlu_nv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqshlu_n_s32 (int32x2_t __a, const int __b)
+{
+  return (uint32x2_t)__builtin_neon_vqshlu_nv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vqshlu_n_s64 (int64x1_t __a, const int __b)
+{
+  return (uint64x1_t)__builtin_neon_vqshlu_ndi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqshluq_n_s8 (int8x16_t __a, const int __b)
+{
+  return (uint8x16_t)__builtin_neon_vqshlu_nv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vqshluq_n_s16 (int16x8_t __a, const int __b)
+{
+  return (uint16x8_t)__builtin_neon_vqshlu_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vqshluq_n_s32 (int32x4_t __a, const int __b)
+{
+  return (uint32x4_t)__builtin_neon_vqshlu_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vqshluq_n_s64 (int64x2_t __a, const int __b)
+{
+  return (uint64x2_t)__builtin_neon_vqshlu_nv2di (__a, __b, 1);
+}
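+
+/* Usage sketch (illustrative): vqshl_n_* saturates the left shift to the
+   range of the element type instead of discarding overflowed bits, and
+   vqshlu_n_* takes signed input but saturates to the unsigned range.  For
+   some vector s of type int8x8_t:
+     uint8x8_t u = vqshlu_n_s8 (s, 3);    negative lanes saturate to 0  */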
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vshll_n_s8 (int8x8_t __a, const int __b)
+{
+  return (int16x8_t)__builtin_neon_vshll_nv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vshll_n_s16 (int16x4_t __a, const int __b)
+{
+  return (int32x4_t)__builtin_neon_vshll_nv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vshll_n_s32 (int32x2_t __a, const int __b)
+{
+  return (int64x2_t)__builtin_neon_vshll_nv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vshll_n_u8 (uint8x8_t __a, const int __b)
+{
+  return (uint16x8_t)__builtin_neon_vshll_nv8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vshll_n_u16 (uint16x4_t __a, const int __b)
+{
+  return (uint32x4_t)__builtin_neon_vshll_nv4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vshll_n_u32 (uint32x2_t __a, const int __b)
+{
+  return (uint64x2_t)__builtin_neon_vshll_nv2si ((int32x2_t) __a, __b, 0);
+}
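+
+/* Usage sketch (illustrative): vshll_n_* shifts each lane left by a
+   constant and widens the result to twice the element width.  For some
+   vector bytes of type uint8x8_t:
+     uint16x8_t widened = vshll_n_u8 (bytes, 4);  */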
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+{
+  return (int8x8_t)__builtin_neon_vsra_nv8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return (int16x4_t)__builtin_neon_vsra_nv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return (int32x2_t)__builtin_neon_vsra_nv2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+{
+  return (int64x1_t)__builtin_neon_vsra_ndi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+{
+  return (uint8x8_t)__builtin_neon_vsra_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint16x4_t)__builtin_neon_vsra_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint32x2_t)__builtin_neon_vsra_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+{
+  return (uint64x1_t)__builtin_neon_vsra_ndi ((int64x1_t) __a, (int64x1_t) __b, __c, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+{
+  return (int8x16_t)__builtin_neon_vsra_nv16qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return (int16x8_t)__builtin_neon_vsra_nv8hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return (int32x4_t)__builtin_neon_vsra_nv4si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+{
+  return (int64x2_t)__builtin_neon_vsra_nv2di (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+{
+  return (uint8x16_t)__builtin_neon_vsra_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+  return (uint16x8_t)__builtin_neon_vsra_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+  return (uint32x4_t)__builtin_neon_vsra_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+{
+  return (uint64x2_t)__builtin_neon_vsra_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+{
+  return (int8x8_t)__builtin_neon_vsra_nv8qi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return (int16x4_t)__builtin_neon_vsra_nv4hi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return (int32x2_t)__builtin_neon_vsra_nv2si (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+{
+  return (int64x1_t)__builtin_neon_vsra_ndi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+{
+  return (uint8x8_t)__builtin_neon_vsra_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint16x4_t)__builtin_neon_vsra_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint32x2_t)__builtin_neon_vsra_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c, 2);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+{
+  return (uint64x1_t)__builtin_neon_vsra_ndi ((int64x1_t) __a, (int64x1_t) __b, __c, 2);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+{
+  return (int8x16_t)__builtin_neon_vsra_nv16qi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return (int16x8_t)__builtin_neon_vsra_nv8hi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return (int32x4_t)__builtin_neon_vsra_nv4si (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+{
+  return (int64x2_t)__builtin_neon_vsra_nv2di (__a, __b, __c, 3);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+{
+  return (uint8x16_t)__builtin_neon_vsra_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c, 2);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+  return (uint16x8_t)__builtin_neon_vsra_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c, 2);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+  return (uint32x4_t)__builtin_neon_vsra_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c, 2);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+{
+  return (uint64x2_t)__builtin_neon_vsra_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c, 2);
+}
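+
+/* Usage sketch (illustrative): vsra_n_* and vrsra_n_* shift the second
+   operand right by a constant (with rounding in the vrsra forms) and
+   accumulate the result into the first operand.  For vectors acc and x of
+   type uint16x4_t:
+     acc = vsra_n_u16 (acc, x, 2);    each lane: acc[i] + (x[i] >> 2)  */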
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+{
+  return (int8x8_t)__builtin_neon_vsri_nv8qi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return (int16x4_t)__builtin_neon_vsri_nv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return (int32x2_t)__builtin_neon_vsri_nv2si (__a, __b, __c);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+{
+  return (int64x1_t)__builtin_neon_vsri_ndi (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+{
+  return (uint8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint32x2_t)__builtin_neon_vsri_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+{
+  return (uint64x1_t)__builtin_neon_vsri_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vsri_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
+{
+  return (poly8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vsri_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+{
+  return (poly16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+{
+  return (int8x16_t)__builtin_neon_vsri_nv16qi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return (int16x8_t)__builtin_neon_vsri_nv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return (int32x4_t)__builtin_neon_vsri_nv4si (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+{
+  return (int64x2_t)__builtin_neon_vsri_nv2di (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+{
+  return (uint8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+  return (uint16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+  return (uint32x4_t)__builtin_neon_vsri_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+{
+  return (uint64x2_t)__builtin_neon_vsri_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vsriq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
+{
+  return (poly8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vsriq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
+{
+  return (poly16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+{
+  return (int8x8_t)__builtin_neon_vsli_nv8qi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return (int16x4_t)__builtin_neon_vsli_nv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return (int32x2_t)__builtin_neon_vsli_nv2si (__a, __b, __c);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+{
+  return (int64x1_t)__builtin_neon_vsli_ndi (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+{
+  return (uint8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint32x2_t)__builtin_neon_vsli_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+{
+  return (uint64x1_t)__builtin_neon_vsli_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vsli_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
+{
+  return (poly8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+{
+  return (poly16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+{
+  return (int8x16_t)__builtin_neon_vsli_nv16qi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return (int16x8_t)__builtin_neon_vsli_nv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return (int32x4_t)__builtin_neon_vsli_nv4si (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+{
+  return (int64x2_t)__builtin_neon_vsli_nv2di (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+{
+  return (uint8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+  return (uint16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+  return (uint32x4_t)__builtin_neon_vsli_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+{
+  return (uint64x2_t)__builtin_neon_vsli_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vsliq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
+{
+  return (poly8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
+{
+  return (poly16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+}
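+
+/* Usage sketch (illustrative): the shift-and-insert forms replace only the
+   bits covered by the shifted second operand and keep the remaining bits
+   of the first operand.  For vectors lo and hi of type uint8x8_t:
+     uint8x8_t packed = vsli_n_u8 (lo, hi, 4);
+   leaves the low 4 bits of each lane of lo intact and inserts hi[i] << 4
+   above them; vsri_n_* is the mirror image for right shifts.  */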
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vabs_s8 (int8x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vabsv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vabs_s16 (int16x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vabsv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vabs_s32 (int32x2_t __a)
+{
+  return (int32x2_t)__builtin_neon_vabsv2si (__a, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vabs_f32 (float32x2_t __a)
+{
+  return (float32x2_t)__builtin_neon_vabsv2sf (__a, 5);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vabsq_s8 (int8x16_t __a)
+{
+  return (int8x16_t)__builtin_neon_vabsv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vabsq_s16 (int16x8_t __a)
+{
+  return (int16x8_t)__builtin_neon_vabsv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vabsq_s32 (int32x4_t __a)
+{
+  return (int32x4_t)__builtin_neon_vabsv4si (__a, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vabsq_f32 (float32x4_t __a)
+{
+  return (float32x4_t)__builtin_neon_vabsv4sf (__a, 5);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqabs_s8 (int8x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vqabsv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqabs_s16 (int16x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vqabsv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqabs_s32 (int32x2_t __a)
+{
+  return (int32x2_t)__builtin_neon_vqabsv2si (__a, 1);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqabsq_s8 (int8x16_t __a)
+{
+  return (int8x16_t)__builtin_neon_vqabsv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqabsq_s16 (int16x8_t __a)
+{
+  return (int16x8_t)__builtin_neon_vqabsv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqabsq_s32 (int32x4_t __a)
+{
+  return (int32x4_t)__builtin_neon_vqabsv4si (__a, 1);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vneg_s8 (int8x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vnegv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vneg_s16 (int16x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vnegv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vneg_s32 (int32x2_t __a)
+{
+  return (int32x2_t)__builtin_neon_vnegv2si (__a, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vneg_f32 (float32x2_t __a)
+{
+  return (float32x2_t)__builtin_neon_vnegv2sf (__a, 5);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vnegq_s8 (int8x16_t __a)
+{
+  return (int8x16_t)__builtin_neon_vnegv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vnegq_s16 (int16x8_t __a)
+{
+  return (int16x8_t)__builtin_neon_vnegv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vnegq_s32 (int32x4_t __a)
+{
+  return (int32x4_t)__builtin_neon_vnegv4si (__a, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vnegq_f32 (float32x4_t __a)
+{
+  return (float32x4_t)__builtin_neon_vnegv4sf (__a, 5);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqneg_s8 (int8x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vqnegv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqneg_s16 (int16x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vqnegv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqneg_s32 (int32x2_t __a)
+{
+  return (int32x2_t)__builtin_neon_vqnegv2si (__a, 1);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqnegq_s8 (int8x16_t __a)
+{
+  return (int8x16_t)__builtin_neon_vqnegv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqnegq_s16 (int16x8_t __a)
+{
+  return (int16x8_t)__builtin_neon_vqnegv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqnegq_s32 (int32x4_t __a)
+{
+  return (int32x4_t)__builtin_neon_vqnegv4si (__a, 1);
+}
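+
+/* Usage sketch (illustrative): vabs/vneg take the lane-wise absolute value
+   or negation; the saturating vqabs/vqneg forms avoid wrapping at the most
+   negative value (e.g. vqabs_s8 maps -128 to 127 rather than back to
+   -128).  For some vector d of type int16x4_t:
+     int16x4_t magnitude = vqabs_s16 (d);  */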
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmvn_s8 (int8x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vmvnv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmvn_s16 (int16x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vmvnv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmvn_s32 (int32x2_t __a)
+{
+  return (int32x2_t)__builtin_neon_vmvnv2si (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmvn_u8 (uint8x8_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmvn_u16 (uint16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vmvnv4hi ((int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmvn_u32 (uint32x2_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vmvnv2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vmvn_p8 (poly8x8_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a, 4);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vmvnq_s8 (int8x16_t __a)
+{
+  return (int8x16_t)__builtin_neon_vmvnv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmvnq_s16 (int16x8_t __a)
+{
+  return (int16x8_t)__builtin_neon_vmvnv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmvnq_s32 (int32x4_t __a)
+{
+  return (int32x4_t)__builtin_neon_vmvnv4si (__a, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vmvnq_u8 (uint8x16_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmvnq_u16 (uint16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vmvnv8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmvnq_u32 (uint32x4_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vmvnv4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vmvnq_p8 (poly8x16_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a, 4);
+}
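+
+/* Usage sketch (illustrative): vmvn_* and vmvnq_* compute the bitwise
+   complement of every lane.  For some vector mask of type uint8x8_t:
+     uint8x8_t inverted = vmvn_u8 (mask);  */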
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vcls_s8 (int8x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vclsv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcls_s16 (int16x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vclsv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcls_s32 (int32x2_t __a)
+{
+  return (int32x2_t)__builtin_neon_vclsv2si (__a, 1);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vclsq_s8 (int8x16_t __a)
+{
+  return (int8x16_t)__builtin_neon_vclsv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vclsq_s16 (int16x8_t __a)
+{
+  return (int16x8_t)__builtin_neon_vclsv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vclsq_s32 (int32x4_t __a)
+{
+  return (int32x4_t)__builtin_neon_vclsv4si (__a, 1);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vclz_s8 (int8x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vclzv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vclz_s16 (int16x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vclzv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vclz_s32 (int32x2_t __a)
+{
+  return (int32x2_t)__builtin_neon_vclzv2si (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vclz_u8 (uint8x8_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vclzv8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vclz_u16 (uint16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vclzv4hi ((int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclz_u32 (uint32x2_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vclzv2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vclzq_s8 (int8x16_t __a)
+{
+  return (int8x16_t)__builtin_neon_vclzv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vclzq_s16 (int16x8_t __a)
+{
+  return (int16x8_t)__builtin_neon_vclzv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vclzq_s32 (int32x4_t __a)
+{
+  return (int32x4_t)__builtin_neon_vclzv4si (__a, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vclzq_u8 (uint8x16_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vclzv16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vclzq_u16 (uint16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vclzv8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vclzq_u32 (uint32x4_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vclzv4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vcnt_s8 (int8x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vcntv8qi (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcnt_u8 (uint8x8_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vcnt_p8 (poly8x8_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a, 4);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vcntq_s8 (int8x16_t __a)
+{
+  return (int8x16_t)__builtin_neon_vcntv16qi (__a, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcntq_u8 (uint8x16_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vcntq_p8 (poly8x16_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a, 4);
+}
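+
+/* Usage sketch (illustrative): vcls counts the leading bits equal to the
+   sign bit (excluding the sign bit itself), vclz counts leading zero bits,
+   and vcnt counts the set bits in each 8-bit lane.  For some vector v of
+   type uint8x8_t:
+     uint8x8_t popcounts = vcnt_u8 (v);  */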
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrecpe_f32 (float32x2_t __a)
+{
+  return (float32x2_t)__builtin_neon_vrecpev2sf (__a, 5);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrecpe_u32 (uint32x2_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vrecpev2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrecpeq_f32 (float32x4_t __a)
+{
+  return (float32x4_t)__builtin_neon_vrecpev4sf (__a, 5);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrecpeq_u32 (uint32x4_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vrecpev4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrsqrte_f32 (float32x2_t __a)
+{
+  return (float32x2_t)__builtin_neon_vrsqrtev2sf (__a, 5);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrsqrte_u32 (uint32x2_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vrsqrtev2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrsqrteq_f32 (float32x4_t __a)
+{
+  return (float32x4_t)__builtin_neon_vrsqrtev4sf (__a, 5);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrsqrteq_u32 (uint32x4_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vrsqrtev4si ((int32x4_t) __a, 0);
+}
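+
+/* Usage sketch (illustrative): vrecpe and vrsqrte return low-precision
+   reciprocal and reciprocal-square-root estimates; the float results are
+   normally refined with the vrecps_f32/vrsqrts_f32 step intrinsics, e.g.
+   one Newton-Raphson step for some vector x of type float32x2_t:
+     float32x2_t r = vrecpe_f32 (x);
+     r = vmul_f32 (vrecps_f32 (x, r), r);  */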
+
+__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+vget_lane_s8 (int8x8_t __a, const int __b)
+{
+  return (int8_t)__builtin_neon_vget_lanev8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+vget_lane_s16 (int16x4_t __a, const int __b)
+{
+  return (int16_t)__builtin_neon_vget_lanev4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vget_lane_s32 (int32x2_t __a, const int __b)
+{
+  return (int32_t)__builtin_neon_vget_lanev2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vget_lane_f32 (float32x2_t __a, const int __b)
+{
+  return (float32_t)__builtin_neon_vget_lanev2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+vget_lane_u8 (uint8x8_t __a, const int __b)
+{
+  return (uint8_t)__builtin_neon_vget_lanev8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+vget_lane_u16 (uint16x4_t __a, const int __b)
+{
+  return (uint16_t)__builtin_neon_vget_lanev4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vget_lane_u32 (uint32x2_t __a, const int __b)
+{
+  return (uint32_t)__builtin_neon_vget_lanev2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
+vget_lane_p8 (poly8x8_t __a, const int __b)
+{
+  return (poly8_t)__builtin_neon_vget_lanev8qi ((int8x8_t) __a, __b, 4);
+}
+
+__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
+vget_lane_p16 (poly16x4_t __a, const int __b)
+{
+  return (poly16_t)__builtin_neon_vget_lanev4hi ((int16x4_t) __a, __b, 4);
+}
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vget_lane_s64 (int64x1_t __a, const int __b)
+{
+  return (int64_t)__builtin_neon_vget_lanedi (__a, __b, 1);
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vget_lane_u64 (uint64x1_t __a, const int __b)
+{
+  return (uint64_t)__builtin_neon_vget_lanedi ((int64x1_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+vgetq_lane_s8 (int8x16_t __a, const int __b)
+{
+  return (int8_t)__builtin_neon_vget_lanev16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+vgetq_lane_s16 (int16x8_t __a, const int __b)
+{
+  return (int16_t)__builtin_neon_vget_lanev8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vgetq_lane_s32 (int32x4_t __a, const int __b)
+{
+  return (int32_t)__builtin_neon_vget_lanev4si (__a, __b, 1);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vgetq_lane_f32 (float32x4_t __a, const int __b)
+{
+  return (float32_t)__builtin_neon_vget_lanev4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+vgetq_lane_u8 (uint8x16_t __a, const int __b)
+{
+  return (uint8_t)__builtin_neon_vget_lanev16qi ((int8x16_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+vgetq_lane_u16 (uint16x8_t __a, const int __b)
+{
+  return (uint16_t)__builtin_neon_vget_lanev8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vgetq_lane_u32 (uint32x4_t __a, const int __b)
+{
+  return (uint32_t)__builtin_neon_vget_lanev4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
+vgetq_lane_p8 (poly8x16_t __a, const int __b)
+{
+  return (poly8_t)__builtin_neon_vget_lanev16qi ((int8x16_t) __a, __b, 4);
+}
+
+__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
+vgetq_lane_p16 (poly16x8_t __a, const int __b)
+{
+  return (poly16_t)__builtin_neon_vget_lanev8hi ((int16x8_t) __a, __b, 4);
+}
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vgetq_lane_s64 (int64x2_t __a, const int __b)
+{
+  return (int64_t)__builtin_neon_vget_lanev2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vgetq_lane_u64 (uint64x2_t __a, const int __b)
+{
+  return (uint64_t)__builtin_neon_vget_lanev2di ((int64x2_t) __a, __b, 0);
+}
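+
+/* Usage sketch (illustrative): vget_lane_* and vgetq_lane_* extract a
+   single lane as a scalar; the lane index must be a compile-time constant
+   within range.  For some vector v of type int16x4_t:
+     int16_t first = vget_lane_s16 (v, 0);  */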
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vset_lane_s8 (int8_t __a, int8x8_t __b, const int __c)
+{
+  return (int8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vset_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
+{
+  return (int16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
+{
+  return (int32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, __b, __c);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vset_lane_f32 (float32_t __a, float32x2_t __b, const int __c)
+{
+  return (float32x2_t)__builtin_neon_vset_lanev2sf (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vset_lane_u8 (uint8_t __a, uint8x8_t __b, const int __c)
+{
+  return (uint8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vset_lane_u16 (uint16_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vset_lane_u32 (uint32_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, (int32x2_t) __b, __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vset_lane_p8 (poly8_t __a, poly8x8_t __b, const int __c)
+{
+  return (poly8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vset_lane_p16 (poly16_t __a, poly16x4_t __b, const int __c)
+{
+  return (poly16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vset_lane_s64 (int64_t __a, int64x1_t __b, const int __c)
+{
+  return (int64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, __b, __c);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vset_lane_u64 (uint64_t __a, uint64x1_t __b, const int __c)
+{
+  return (uint64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, (int64x1_t) __b, __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vsetq_lane_s8 (int8_t __a, int8x16_t __b, const int __c)
+{
+  return (int8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsetq_lane_s16 (int16_t __a, int16x8_t __b, const int __c)
+{
+  return (int16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
+{
+  return (int32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, __b, __c);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c)
+{
+  return (float32x4_t)__builtin_neon_vset_lanev4sf (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vsetq_lane_u8 (uint8_t __a, uint8x16_t __b, const int __c)
+{
+  return (uint8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsetq_lane_u16 (uint16_t __a, uint16x8_t __b, const int __c)
+{
+  return (uint16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsetq_lane_u32 (uint32_t __a, uint32x4_t __b, const int __c)
+{
+  return (uint32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, (int32x4_t) __b, __c);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vsetq_lane_p8 (poly8_t __a, poly8x16_t __b, const int __c)
+{
+  return (poly8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vsetq_lane_p16 (poly16_t __a, poly16x8_t __b, const int __c)
+{
+  return (poly16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsetq_lane_s64 (int64_t __a, int64x2_t __b, const int __c)
+{
+  return (int64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsetq_lane_u64 (uint64_t __a, uint64x2_t __b, const int __c)
+{
+  return (uint64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, (int64x2_t) __b, __c);
+}
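+
+/* Usage sketch (illustrative): vset_lane_* and vsetq_lane_* return a copy
+   of the vector with one lane replaced by the given scalar.  For some
+   vector v of type float32x2_t:
+     v = vset_lane_f32 (1.0f, v, 1);  */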
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vcreate_s8 (uint64_t __a)
+{
+  return (int8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcreate_s16 (uint64_t __a)
+{
+  return (int16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcreate_s32 (uint64_t __a)
+{
+  return (int32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vcreate_s64 (uint64_t __a)
+{
+  return (int64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcreate_f32 (uint64_t __a)
+{
+  return (float32x2_t)__builtin_neon_vcreatev2sf ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcreate_u8 (uint64_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcreate_u16 (uint64_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcreate_u32 (uint64_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcreate_u64 (uint64_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vcreate_p8 (uint64_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vcreate_p16 (uint64_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vdup_n_s8 (int8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vdup_n_s16 (int16_t __a)
+{
+  return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vdup_n_s32 (int32_t __a)
+{
+  return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vdup_n_f32 (float32_t __a)
+{
+  return (float32x2_t)__builtin_neon_vdup_nv2sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vdup_n_u8 (uint8_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vdup_n_u16 (uint16_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vdup_n_u32 (uint32_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vdup_n_p8 (poly8_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vdup_n_p16 (poly16_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vdup_n_s64 (int64_t __a)
+{
+  return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vdup_n_u64 (uint64_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vdupq_n_s8 (int8_t __a)
+{
+  return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vdupq_n_s16 (int16_t __a)
+{
+  return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vdupq_n_s32 (int32_t __a)
+{
+  return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vdupq_n_f32 (float32_t __a)
+{
+  return (float32x4_t)__builtin_neon_vdup_nv4sf (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vdupq_n_u8 (uint8_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vdupq_n_u16 (uint16_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vdupq_n_u32 (uint32_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vdupq_n_p8 (poly8_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vdupq_n_p16 (poly16_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vdupq_n_s64 (int64_t __a)
+{
+  return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vdupq_n_u64 (uint64_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmov_n_s8 (int8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmov_n_s16 (int16_t __a)
+{
+  return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmov_n_s32 (int32_t __a)
+{
+  return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmov_n_f32 (float32_t __a)
+{
+  return (float32x2_t)__builtin_neon_vdup_nv2sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmov_n_u8 (uint8_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmov_n_u16 (uint16_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmov_n_u32 (uint32_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vmov_n_p8 (poly8_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vmov_n_p16 (poly16_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vmov_n_s64 (int64_t __a)
+{
+  return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vmov_n_u64 (uint64_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vmovq_n_s8 (int8_t __a)
+{
+  return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmovq_n_s16 (int16_t __a)
+{
+  return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmovq_n_s32 (int32_t __a)
+{
+  return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmovq_n_f32 (float32_t __a)
+{
+  return (float32x4_t)__builtin_neon_vdup_nv4sf (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vmovq_n_u8 (uint8_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmovq_n_u16 (uint16_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmovq_n_u32 (uint32_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vmovq_n_p8 (poly8_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmovq_n_p16 (poly16_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmovq_n_s64 (int64_t __a)
+{
+  return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmovq_n_u64 (uint64_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vdup_lane_s8 (int8x8_t __a, const int __b)
+{
+  return (int8x8_t)__builtin_neon_vdup_lanev8qi (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vdup_lane_s16 (int16x4_t __a, const int __b)
+{
+  return (int16x4_t)__builtin_neon_vdup_lanev4hi (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vdup_lane_s32 (int32x2_t __a, const int __b)
+{
+  return (int32x2_t)__builtin_neon_vdup_lanev2si (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vdup_lane_f32 (float32x2_t __a, const int __b)
+{
+  return (float32x2_t)__builtin_neon_vdup_lanev2sf (__a, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vdup_lane_u8 (uint8x8_t __a, const int __b)
+{
+  return (uint8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vdup_lane_u16 (uint16x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vdup_lane_u32 (uint32x2_t __a, const int __b)
+{
+  return (uint32x2_t)__builtin_neon_vdup_lanev2si ((int32x2_t) __a, __b);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vdup_lane_p8 (poly8x8_t __a, const int __b)
+{
+  return (poly8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vdup_lane_p16 (poly16x4_t __a, const int __b)
+{
+  return (poly16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vdup_lane_s64 (int64x1_t __a, const int __b)
+{
+  return (int64x1_t)__builtin_neon_vdup_lanedi (__a, __b);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vdup_lane_u64 (uint64x1_t __a, const int __b)
+{
+  return (uint64x1_t)__builtin_neon_vdup_lanedi ((int64x1_t) __a, __b);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vdupq_lane_s8 (int8x8_t __a, const int __b)
+{
+  return (int8x16_t)__builtin_neon_vdup_lanev16qi (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vdupq_lane_s16 (int16x4_t __a, const int __b)
+{
+  return (int16x8_t)__builtin_neon_vdup_lanev8hi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vdupq_lane_s32 (int32x2_t __a, const int __b)
+{
+  return (int32x4_t)__builtin_neon_vdup_lanev4si (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vdupq_lane_f32 (float32x2_t __a, const int __b)
+{
+  return (float32x4_t)__builtin_neon_vdup_lanev4sf (__a, __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vdupq_lane_u8 (uint8x8_t __a, const int __b)
+{
+  return (uint8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vdupq_lane_u16 (uint16x4_t __a, const int __b)
+{
+  return (uint16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vdupq_lane_u32 (uint32x2_t __a, const int __b)
+{
+  return (uint32x4_t)__builtin_neon_vdup_lanev4si ((int32x2_t) __a, __b);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vdupq_lane_p8 (poly8x8_t __a, const int __b)
+{
+  return (poly8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vdupq_lane_p16 (poly16x4_t __a, const int __b)
+{
+  return (poly16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vdupq_lane_s64 (int64x1_t __a, const int __b)
+{
+  return (int64x2_t)__builtin_neon_vdup_lanev2di (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vdupq_lane_u64 (uint64x1_t __a, const int __b)
+{
+  return (uint64x2_t)__builtin_neon_vdup_lanev2di ((int64x1_t) __a, __b);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vcombine_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x16_t)__builtin_neon_vcombinev8qi (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcombine_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x8_t)__builtin_neon_vcombinev4hi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcombine_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x4_t)__builtin_neon_vcombinev2si (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcombine_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x2_t)__builtin_neon_vcombinedi (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcombine_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (float32x4_t)__builtin_neon_vcombinev2sf (__a, __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcombine_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcombine_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcombine_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vcombinev2si ((int32x2_t) __a, (int32x2_t) __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcombine_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vcombinedi ((int64x1_t) __a, (int64x1_t) __b);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vcombine_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return (poly8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vcombine_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+  return (poly16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vget_high_s8 (int8x16_t __a)
+{
+  return (int8x8_t)__builtin_neon_vget_highv16qi (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vget_high_s16 (int16x8_t __a)
+{
+  return (int16x4_t)__builtin_neon_vget_highv8hi (__a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vget_high_s32 (int32x4_t __a)
+{
+  return (int32x2_t)__builtin_neon_vget_highv4si (__a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vget_high_s64 (int64x2_t __a)
+{
+  return (int64x1_t)__builtin_neon_vget_highv2di (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vget_high_f32 (float32x4_t __a)
+{
+  return (float32x2_t)__builtin_neon_vget_highv4sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vget_high_u8 (uint8x16_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vget_high_u16 (uint16x8_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vget_high_u32 (uint32x4_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vget_highv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vget_high_u64 (uint64x2_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vget_highv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vget_high_p8 (poly8x16_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vget_high_p16 (poly16x8_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vget_low_s8 (int8x16_t __a)
+{
+  return (int8x8_t)__builtin_neon_vget_lowv16qi (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vget_low_s16 (int16x8_t __a)
+{
+  return (int16x4_t)__builtin_neon_vget_lowv8hi (__a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vget_low_s32 (int32x4_t __a)
+{
+  return (int32x2_t)__builtin_neon_vget_lowv4si (__a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vget_low_s64 (int64x2_t __a)
+{
+  return (int64x1_t)__builtin_neon_vget_lowv2di (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vget_low_f32 (float32x4_t __a)
+{
+  return (float32x2_t)__builtin_neon_vget_lowv4sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vget_low_u8 (uint8x16_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vget_low_u16 (uint16x8_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vget_low_u32 (uint32x4_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vget_lowv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vget_low_u64 (uint64x2_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vget_lowv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vget_low_p8 (poly8x16_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vget_low_p16 (poly16x8_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvt_s32_f32 (float32x2_t __a)
+{
+  return (int32x2_t)__builtin_neon_vcvtv2sf (__a, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_f32_s32 (int32x2_t __a)
+{
+  return (float32x2_t)__builtin_neon_vcvtv2si (__a, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_f32_u32 (uint32x2_t __a)
+{
+  return (float32x2_t)__builtin_neon_vcvtv2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvt_u32_f32 (float32x2_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vcvtv2sf (__a, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtq_s32_f32 (float32x4_t __a)
+{
+  return (int32x4_t)__builtin_neon_vcvtv4sf (__a, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_f32_s32 (int32x4_t __a)
+{
+  return (float32x4_t)__builtin_neon_vcvtv4si (__a, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_f32_u32 (uint32x4_t __a)
+{
+  return (float32x4_t)__builtin_neon_vcvtv4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtq_u32_f32 (float32x4_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vcvtv4sf (__a, 0);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvt_n_s32_f32 (float32x2_t __a, const int __b)
+{
+  return (int32x2_t)__builtin_neon_vcvt_nv2sf (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_n_f32_s32 (int32x2_t __a, const int __b)
+{
+  return (float32x2_t)__builtin_neon_vcvt_nv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_n_f32_u32 (uint32x2_t __a, const int __b)
+{
+  return (float32x2_t)__builtin_neon_vcvt_nv2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvt_n_u32_f32 (float32x2_t __a, const int __b)
+{
+  return (uint32x2_t)__builtin_neon_vcvt_nv2sf (__a, __b, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_s32_f32 (float32x4_t __a, const int __b)
+{
+  return (int32x4_t)__builtin_neon_vcvt_nv4sf (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_f32_s32 (int32x4_t __a, const int __b)
+{
+  return (float32x4_t)__builtin_neon_vcvt_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_f32_u32 (uint32x4_t __a, const int __b)
+{
+  return (float32x4_t)__builtin_neon_vcvt_nv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_u32_f32 (float32x4_t __a, const int __b)
+{
+  return (uint32x4_t)__builtin_neon_vcvt_nv4sf (__a, __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmovn_s16 (int16x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vmovnv8hi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmovn_s32 (int32x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vmovnv4si (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmovn_s64 (int64x2_t __a)
+{
+  return (int32x2_t)__builtin_neon_vmovnv2di (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmovn_u16 (uint16x8_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vmovnv8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmovn_u32 (uint32x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vmovnv4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmovn_u64 (uint64x2_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vmovnv2di ((int64x2_t) __a, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqmovn_s16 (int16x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vqmovnv8hi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqmovn_s32 (int32x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vqmovnv4si (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqmovn_s64 (int64x2_t __a)
+{
+  return (int32x2_t)__builtin_neon_vqmovnv2di (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqmovn_u16 (uint16x8_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vqmovnv8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqmovn_u32 (uint32x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vqmovnv4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqmovn_u64 (uint64x2_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vqmovnv2di ((int64x2_t) __a, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqmovun_s16 (int16x8_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vqmovunv8hi (__a, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqmovun_s32 (int32x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vqmovunv4si (__a, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqmovun_s64 (int64x2_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vqmovunv2di (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmovl_s8 (int8x8_t __a)
+{
+  return (int16x8_t)__builtin_neon_vmovlv8qi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmovl_s16 (int16x4_t __a)
+{
+  return (int32x4_t)__builtin_neon_vmovlv4hi (__a, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmovl_s32 (int32x2_t __a)
+{
+  return (int64x2_t)__builtin_neon_vmovlv2si (__a, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmovl_u8 (uint8x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vmovlv8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmovl_u16 (uint16x4_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vmovlv4hi ((int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmovl_u32 (uint32x2_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vmovlv2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbl1_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vtbl1v8qi (__a, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbl1_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbl1_p8 (poly8x8_t __a, uint8x8_t __b)
+{
+  return (poly8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbl2_s8 (int8x8x2_t __a, int8x8_t __b)
+{
+  union { int8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
+  return (int8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbl2_u8 (uint8x8x2_t __a, uint8x8_t __b)
+{
+  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
+  return (uint8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbl2_p8 (poly8x8x2_t __a, uint8x8_t __b)
+{
+  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
+  return (poly8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbl3_s8 (int8x8x3_t __a, int8x8_t __b)
+{
+  union { int8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
+  return (int8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbl3_u8 (uint8x8x3_t __a, uint8x8_t __b)
+{
+  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
+  return (uint8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbl3_p8 (poly8x8x3_t __a, uint8x8_t __b)
+{
+  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
+  return (poly8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbl4_s8 (int8x8x4_t __a, int8x8_t __b)
+{
+  union { int8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
+  return (int8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbl4_u8 (uint8x8x4_t __a, uint8x8_t __b)
+{
+  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
+  return (uint8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbl4_p8 (poly8x8x4_t __a, uint8x8_t __b)
+{
+  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
+  return (poly8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx1_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+  return (int8x8_t)__builtin_neon_vtbx1v8qi (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx1_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+  return (uint8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx1_p8 (poly8x8_t __a, poly8x8_t __b, uint8x8_t __c)
+{
+  return (poly8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx2_s8 (int8x8_t __a, int8x8x2_t __b, int8x8_t __c)
+{
+  union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  return (int8x8_t)__builtin_neon_vtbx2v8qi (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx2_u8 (uint8x8_t __a, uint8x8x2_t __b, uint8x8_t __c)
+{
+  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  return (uint8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx2_p8 (poly8x8_t __a, poly8x8x2_t __b, uint8x8_t __c)
+{
+  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  return (poly8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx3_s8 (int8x8_t __a, int8x8x3_t __b, int8x8_t __c)
+{
+  union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  return (int8x8_t)__builtin_neon_vtbx3v8qi (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx3_u8 (uint8x8_t __a, uint8x8x3_t __b, uint8x8_t __c)
+{
+  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  return (uint8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx3_p8 (poly8x8_t __a, poly8x8x3_t __b, uint8x8_t __c)
+{
+  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  return (poly8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx4_s8 (int8x8_t __a, int8x8x4_t __b, int8x8_t __c)
+{
+  union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  return (int8x8_t)__builtin_neon_vtbx4v8qi (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx4_u8 (uint8x8_t __a, uint8x8x4_t __b, uint8x8_t __c)
+{
+  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  return (uint8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx4_p8 (poly8x8_t __a, poly8x8x4_t __b, uint8x8_t __c)
+{
+  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  return (poly8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return (int16x4_t)__builtin_neon_vmul_lanev4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return (int32x2_t)__builtin_neon_vmul_lanev2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __c)
+{
+  return (float32x2_t)__builtin_neon_vmul_lanev2sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint16x4_t)__builtin_neon_vmul_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint32x2_t)__builtin_neon_vmul_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+{
+  return (int16x8_t)__builtin_neon_vmul_lanev8hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+{
+  return (int32x4_t)__builtin_neon_vmul_lanev4si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __c)
+{
+  return (float32x4_t)__builtin_neon_vmul_lanev4sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint16x8_t)__builtin_neon_vmul_lanev8hi ((int16x8_t) __a, (int16x4_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint32x4_t)__builtin_neon_vmul_lanev4si ((int32x4_t) __a, (int32x2_t) __b, __c, 0);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmla_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+{
+  return (int16x4_t)__builtin_neon_vmla_lanev4hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmla_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+{
+  return (int32x2_t)__builtin_neon_vmla_lanev2si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmla_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d)
+{
+  return (float32x2_t)__builtin_neon_vmla_lanev2sf (__a, __b, __c, __d, 5);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
+{
+  return (uint16x4_t)__builtin_neon_vmla_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
+{
+  return (uint32x2_t)__builtin_neon_vmla_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
+{
+  return (int16x8_t)__builtin_neon_vmla_lanev8hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
+{
+  return (int32x4_t)__builtin_neon_vmla_lanev4si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d)
+{
+  return (float32x4_t)__builtin_neon_vmla_lanev4sf (__a, __b, __c, __d, 5);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d)
+{
+  return (uint16x8_t)__builtin_neon_vmla_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d)
+{
+  return (uint32x4_t)__builtin_neon_vmla_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+{
+  return (int32x4_t)__builtin_neon_vmlal_lanev4hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+{
+  return (int64x2_t)__builtin_neon_vmlal_lanev2si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlal_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
+{
+  return (uint32x4_t)__builtin_neon_vmlal_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmlal_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
+{
+  return (uint64x2_t)__builtin_neon_vmlal_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+{
+  return (int32x4_t)__builtin_neon_vqdmlal_lanev4hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+{
+  return (int64x2_t)__builtin_neon_vqdmlal_lanev2si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmls_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+{
+  return (int16x4_t)__builtin_neon_vmls_lanev4hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmls_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+{
+  return (int32x2_t)__builtin_neon_vmls_lanev2si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmls_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d)
+{
+  return (float32x2_t)__builtin_neon_vmls_lanev2sf (__a, __b, __c, __d, 5);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
+{
+  return (uint16x4_t)__builtin_neon_vmls_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
+{
+  return (uint32x2_t)__builtin_neon_vmls_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
+{
+  return (int16x8_t)__builtin_neon_vmls_lanev8hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
+{
+  return (int32x4_t)__builtin_neon_vmls_lanev4si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d)
+{
+  return (float32x4_t)__builtin_neon_vmls_lanev4sf (__a, __b, __c, __d, 5);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d)
+{
+  return (uint16x8_t)__builtin_neon_vmls_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d)
+{
+  return (uint32x4_t)__builtin_neon_vmls_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+{
+  return (int32x4_t)__builtin_neon_vmlsl_lanev4hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+{
+  return (int64x2_t)__builtin_neon_vmlsl_lanev2si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsl_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
+{
+  return (uint32x4_t)__builtin_neon_vmlsl_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmlsl_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
+{
+  return (uint64x2_t)__builtin_neon_vmlsl_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+{
+  return (int32x4_t)__builtin_neon_vqdmlsl_lanev4hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+{
+  return (int64x2_t)__builtin_neon_vqdmlsl_lanev2si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return (int32x4_t)__builtin_neon_vmull_lanev4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return (int64x2_t)__builtin_neon_vmull_lanev2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint32x4_t)__builtin_neon_vmull_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint64x2_t)__builtin_neon_vmull_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return (int32x4_t)__builtin_neon_vqdmull_lanev4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return (int64x2_t)__builtin_neon_vqdmull_lanev2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+{
+  return (int16x8_t)__builtin_neon_vqdmulh_lanev8hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+{
+  return (int32x4_t)__builtin_neon_vqdmulh_lanev4si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return (int16x4_t)__builtin_neon_vqdmulh_lanev4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return (int32x2_t)__builtin_neon_vqdmulh_lanev2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+{
+  return (int16x8_t)__builtin_neon_vqdmulh_lanev8hi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+{
+  return (int32x4_t)__builtin_neon_vqdmulh_lanev4si (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return (int16x4_t)__builtin_neon_vqdmulh_lanev4hi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return (int32x2_t)__builtin_neon_vqdmulh_lanev2si (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return (int32x2_t)__builtin_neon_vmul_nv2si (__a, (__builtin_neon_si) __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_n_f32 (float32x2_t __a, float32_t __b)
+{
+  return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return (int16x8_t)__builtin_neon_vmul_nv8hi (__a, (__builtin_neon_hi) __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return (int32x4_t)__builtin_neon_vmul_nv4si (__a, (__builtin_neon_si) __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_n_f32 (float32x4_t __a, float32_t __b)
+{
+  return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return (int32x4_t)__builtin_neon_vmull_nv4hi (__a, (__builtin_neon_hi) __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return (int64x2_t)__builtin_neon_vmull_nv2si (__a, (__builtin_neon_si) __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vmull_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vmull_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmull_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return (int32x4_t)__builtin_neon_vqdmull_nv4hi (__a, (__builtin_neon_hi) __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmull_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return (int64x2_t)__builtin_neon_vqdmull_nv2si (__a, (__builtin_neon_si) __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqdmulhq_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return (int16x8_t)__builtin_neon_vqdmulh_nv8hi (__a, (__builtin_neon_hi) __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmulhq_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return (int32x4_t)__builtin_neon_vqdmulh_nv4si (__a, (__builtin_neon_si) __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqdmulh_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return (int16x4_t)__builtin_neon_vqdmulh_nv4hi (__a, (__builtin_neon_hi) __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqdmulh_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return (int32x2_t)__builtin_neon_vqdmulh_nv2si (__a, (__builtin_neon_si) __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqrdmulhq_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return (int16x8_t)__builtin_neon_vqdmulh_nv8hi (__a, (__builtin_neon_hi) __b, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return (int32x4_t)__builtin_neon_vqdmulh_nv4si (__a, (__builtin_neon_si) __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrdmulh_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return (int16x4_t)__builtin_neon_vqdmulh_nv4hi (__a, (__builtin_neon_hi) __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrdmulh_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return (int32x2_t)__builtin_neon_vqdmulh_nv2si (__a, (__builtin_neon_si) __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmla_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
+{
+  return (int16x4_t)__builtin_neon_vmla_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
+{
+  return (int32x2_t)__builtin_neon_vmla_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+{
+  return (float32x2_t)__builtin_neon_vmla_nv2sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmla_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
+{
+  return (uint16x4_t)__builtin_neon_vmla_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmla_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
+{
+  return (uint32x2_t)__builtin_neon_vmla_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlaq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
+{
+  return (int16x8_t)__builtin_neon_vmla_nv8hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlaq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
+{
+  return (int32x4_t)__builtin_neon_vmla_nv4si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+{
+  return (float32x4_t)__builtin_neon_vmla_nv4sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlaq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c)
+{
+  return (uint16x8_t)__builtin_neon_vmla_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlaq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c)
+{
+  return (uint32x4_t)__builtin_neon_vmla_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
+{
+  return (int32x4_t)__builtin_neon_vmlal_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
+{
+  return (int64x2_t)__builtin_neon_vmlal_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlal_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
+{
+  return (uint32x4_t)__builtin_neon_vmlal_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmlal_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
+{
+  return (uint64x2_t)__builtin_neon_vmlal_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
+{
+  return (int32x4_t)__builtin_neon_vqdmlal_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
+{
+  return (int64x2_t)__builtin_neon_vqdmlal_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmls_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
+{
+  return (int16x4_t)__builtin_neon_vmls_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
+{
+  return (int32x2_t)__builtin_neon_vmls_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+{
+  return (float32x2_t)__builtin_neon_vmls_nv2sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmls_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
+{
+  return (uint16x4_t)__builtin_neon_vmls_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmls_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
+{
+  return (uint32x2_t)__builtin_neon_vmls_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlsq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
+{
+  return (int16x8_t)__builtin_neon_vmls_nv8hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
+{
+  return (int32x4_t)__builtin_neon_vmls_nv4si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+{
+  return (float32x4_t)__builtin_neon_vmls_nv4sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlsq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c)
+{
+  return (uint16x8_t)__builtin_neon_vmls_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c)
+{
+  return (uint32x4_t)__builtin_neon_vmls_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
+{
+  return (int32x4_t)__builtin_neon_vmlsl_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
+{
+  return (int64x2_t)__builtin_neon_vmlsl_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsl_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
+{
+  return (uint32x4_t)__builtin_neon_vmlsl_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmlsl_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
+{
+  return (uint64x2_t)__builtin_neon_vmlsl_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
+{
+  return (int32x4_t)__builtin_neon_vqdmlsl_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
+{
+  return (int64x2_t)__builtin_neon_vqdmlsl_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
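The vmla*_n, vmlal*_n, vqdmlal*_n, vmls*_n, vmlsl*_n and vqdmlsl*_n intrinsics above all fold a scalar into every lane of the second vector operand before accumulating into (or subtracting from) the first. A minimal caller sketch, assuming NEON code generation is enabled and using illustrative names:

  #include <arm_neon.h>

  /* acc[i] += v[i] * scale for four float lanes at once.  */
  float32x4_t scale_accumulate (float32x4_t acc, float32x4_t v, float32_t scale)
  {
    return vmlaq_n_f32 (acc, v, scale);
  }
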
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vext_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+{
+  return (int8x8_t)__builtin_neon_vextv8qi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vext_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return (int16x4_t)__builtin_neon_vextv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vext_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return (int32x2_t)__builtin_neon_vextv2si (__a, __b, __c);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vext_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+{
+  return (int64x1_t)__builtin_neon_vextdi (__a, __b, __c);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vext_f32 (float32x2_t __a, float32x2_t __b, const int __c)
+{
+  return (float32x2_t)__builtin_neon_vextv2sf (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vext_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+{
+  return (uint8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vext_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vext_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint32x2_t)__builtin_neon_vextv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vext_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+{
+  return (uint64x1_t)__builtin_neon_vextdi ((int64x1_t) __a, (int64x1_t) __b, __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vext_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
+{
+  return (poly8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vext_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+{
+  return (poly16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vextq_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+{
+  return (int8x16_t)__builtin_neon_vextv16qi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vextq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return (int16x8_t)__builtin_neon_vextv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vextq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return (int32x4_t)__builtin_neon_vextv4si (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vextq_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+{
+  return (int64x2_t)__builtin_neon_vextv2di (__a, __b, __c);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vextq_f32 (float32x4_t __a, float32x4_t __b, const int __c)
+{
+  return (float32x4_t)__builtin_neon_vextv4sf (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vextq_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+{
+  return (uint8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vextq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+  return (uint16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vextq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+  return (uint32x4_t)__builtin_neon_vextv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vextq_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+{
+  return (uint64x2_t)__builtin_neon_vextv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vextq_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
+{
+  return (poly8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vextq_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
+{
+  return (poly16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+}
+
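The vext*/vextq* intrinsics concatenate their two operands and extract a window of consecutive lanes starting at the index given by the final argument, which must be a compile-time constant in range for the element count. A hedged usage sketch with illustrative names:

  #include <arm_neon.h>

  /* Returns bytes 3..10 of the 16-byte concatenation {lo, hi}.  */
  uint8x8_t shift_window (uint8x8_t lo, uint8x8_t hi)
  {
    return vext_u8 (lo, hi, 3);   /* index must be a constant 0..7 here */
  }
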
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrev64_s8 (int8x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vrev64v8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrev64_s16 (int16x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vrev64v4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrev64_s32 (int32x2_t __a)
+{
+  return (int32x2_t)__builtin_neon_vrev64v2si (__a, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrev64_f32 (float32x2_t __a)
+{
+  return (float32x2_t)__builtin_neon_vrev64v2sf (__a, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrev64_u8 (uint8x8_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrev64_u16 (uint16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrev64_u32 (uint32x2_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vrev64v2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vrev64_p8 (poly8x8_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 4);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vrev64_p16 (poly16x4_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 4);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrev64q_s8 (int8x16_t __a)
+{
+  return (int8x16_t)__builtin_neon_vrev64v16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrev64q_s16 (int16x8_t __a)
+{
+  return (int16x8_t)__builtin_neon_vrev64v8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vrev64q_s32 (int32x4_t __a)
+{
+  return (int32x4_t)__builtin_neon_vrev64v4si (__a, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrev64q_f32 (float32x4_t __a)
+{
+  return (float32x4_t)__builtin_neon_vrev64v4sf (__a, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrev64q_u8 (uint8x16_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrev64q_u16 (uint16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrev64q_u32 (uint32x4_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vrev64v4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vrev64q_p8 (poly8x16_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 4);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vrev64q_p16 (poly16x8_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 4);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrev32_s8 (int8x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vrev32v8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrev32_s16 (int16x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vrev32v4hi (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrev32_u8 (uint8x8_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrev32_u16 (uint16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 0);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vrev32_p8 (poly8x8_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 4);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vrev32_p16 (poly16x4_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 4);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrev32q_s8 (int8x16_t __a)
+{
+  return (int8x16_t)__builtin_neon_vrev32v16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrev32q_s16 (int16x8_t __a)
+{
+  return (int16x8_t)__builtin_neon_vrev32v8hi (__a, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrev32q_u8 (uint8x16_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrev32q_u16 (uint16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vrev32q_p8 (poly8x16_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 4);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vrev32q_p16 (poly16x8_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 4);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrev16_s8 (int8x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vrev16v8qi (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrev16_u8 (uint8x8_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vrev16_p8 (poly8x8_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 4);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrev16q_s8 (int8x16_t __a)
+{
+  return (int8x16_t)__builtin_neon_vrev16v16qi (__a, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrev16q_u8 (uint8x16_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vrev16q_p8 (poly8x16_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 4);
+}
+
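The vrev64*/vrev32*/vrev16* intrinsics reverse the element order within each 64-, 32- or 16-bit group of a vector, which is handy for in-register endianness swaps. A small sketch, with illustrative names:

  #include <arm_neon.h>

  /* Reverse the byte order inside each 32-bit group of an 8-byte vector.  */
  uint8x8_t swap_words (uint8x8_t bytes)
  {
    return vrev32_u8 (bytes);   /* vrev16_u8 / vrev64_u8 swap 16-/64-bit groups */
  }
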
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vbsl_s8 (uint8x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+  return (int8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vbsl_s16 (uint16x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+  return (int16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vbsl_s32 (uint32x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+  return (int32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, __b, __c);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vbsl_s64 (uint64x1_t __a, int64x1_t __b, int64x1_t __c)
+{
+  return (int64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, __b, __c);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+  return (float32x2_t)__builtin_neon_vbslv2sf ((int32x2_t) __a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vbsl_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+  return (uint8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vbsl_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+  return (uint16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vbsl_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+  return (uint32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c)
+{
+  return (uint64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, (int64x1_t) __b, (int64x1_t) __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vbsl_p8 (uint8x8_t __a, poly8x8_t __b, poly8x8_t __c)
+{
+  return (poly8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c)
+{
+  return (poly16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vbslq_s8 (uint8x16_t __a, int8x16_t __b, int8x16_t __c)
+{
+  return (int8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vbslq_s16 (uint16x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+  return (int16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vbslq_s32 (uint32x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+  return (int32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vbslq_s64 (uint64x2_t __a, int64x2_t __b, int64x2_t __c)
+{
+  return (int64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, __b, __c);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+  return (float32x4_t)__builtin_neon_vbslv4sf ((int32x4_t) __a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vbslq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
+{
+  return (uint8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vbslq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
+{
+  return (uint16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vbslq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
+{
+  return (uint32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c)
+{
+  return (uint64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, (int64x2_t) __b, (int64x2_t) __c);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vbslq_p8 (uint8x16_t __a, poly8x16_t __b, poly8x16_t __c)
+{
+  return (poly8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c)
+{
+  return (poly16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
+}
+
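The vbsl*/vbslq* intrinsics perform a bitwise select: each result bit is taken from the second operand where the corresponding mask bit in the first operand is set, and from the third operand otherwise. One common pattern is a branchless lane-wise select driven by a comparison mask; a minimal sketch with illustrative names, using the vcgt_s32 comparison intrinsic defined elsewhere in this header:

  #include <arm_neon.h>

  /* Lane-wise maximum of two int32x2_t values via compare-and-select.  */
  int32x2_t select_max (int32x2_t a, int32x2_t b)
  {
    uint32x2_t mask = vcgt_s32 (a, b);   /* all-ones lanes where a > b */
    return vbsl_s32 (mask, a, b);        /* a where the mask is set, else b */
  }
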
+__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+vtrn_s8 (int8x8_t __a, int8x8_t __b)
+{
+  int8x8x2_t __rv;
+  __builtin_neon_vtrnv8qi (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+vtrn_s16 (int16x4_t __a, int16x4_t __b)
+{
+  int16x4x2_t __rv;
+  __builtin_neon_vtrnv4hi (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vtrn_s32 (int32x2_t __a, int32x2_t __b)
+{
+  int32x2x2_t __rv;
+  __builtin_neon_vtrnv2si (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vtrn_f32 (float32x2_t __a, float32x2_t __b)
+{
+  float32x2x2_t __rv;
+  __builtin_neon_vtrnv2sf (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+vtrn_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  uint8x8x2_t __rv;
+  __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+vtrn_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  uint16x4x2_t __rv;
+  __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  uint32x2x2_t __rv;
+  __builtin_neon_vtrnv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+vtrn_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  poly8x8x2_t __rv;
+  __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+vtrn_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+  poly16x4x2_t __rv;
+  __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
+vtrnq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  int8x16x2_t __rv;
+  __builtin_neon_vtrnv16qi (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
+vtrnq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  int16x8x2_t __rv;
+  __builtin_neon_vtrnv8hi (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
+vtrnq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  int32x4x2_t __rv;
+  __builtin_neon_vtrnv4si (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
+vtrnq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  float32x4x2_t __rv;
+  __builtin_neon_vtrnv4sf (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
+vtrnq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  uint8x16x2_t __rv;
+  __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
+vtrnq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  uint16x8x2_t __rv;
+  __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
+vtrnq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  uint32x4x2_t __rv;
+  __builtin_neon_vtrnv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
+vtrnq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  poly8x16x2_t __rv;
+  __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
+vtrnq_p16 (poly16x8_t __a, poly16x8_t __b)
+{
+  poly16x8x2_t __rv;
+  __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+vzip_s8 (int8x8_t __a, int8x8_t __b)
+{
+  int8x8x2_t __rv;
+  __builtin_neon_vzipv8qi (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+vzip_s16 (int16x4_t __a, int16x4_t __b)
+{
+  int16x4x2_t __rv;
+  __builtin_neon_vzipv4hi (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vzip_s32 (int32x2_t __a, int32x2_t __b)
+{
+  int32x2x2_t __rv;
+  __builtin_neon_vzipv2si (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vzip_f32 (float32x2_t __a, float32x2_t __b)
+{
+  float32x2x2_t __rv;
+  __builtin_neon_vzipv2sf (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+vzip_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  uint8x8x2_t __rv;
+  __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+vzip_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  uint16x4x2_t __rv;
+  __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vzip_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  uint32x2x2_t __rv;
+  __builtin_neon_vzipv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+vzip_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  poly8x8x2_t __rv;
+  __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+vzip_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+  poly16x4x2_t __rv;
+  __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
+vzipq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  int8x16x2_t __rv;
+  __builtin_neon_vzipv16qi (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
+vzipq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  int16x8x2_t __rv;
+  __builtin_neon_vzipv8hi (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
+vzipq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  int32x4x2_t __rv;
+  __builtin_neon_vzipv4si (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
+vzipq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  float32x4x2_t __rv;
+  __builtin_neon_vzipv4sf (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
+vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  uint8x16x2_t __rv;
+  __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
+vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  uint16x8x2_t __rv;
+  __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
+vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  uint32x4x2_t __rv;
+  __builtin_neon_vzipv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
+vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  poly8x16x2_t __rv;
+  __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
+vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
+{
+  poly16x8x2_t __rv;
+  __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+vuzp_s8 (int8x8_t __a, int8x8_t __b)
+{
+  int8x8x2_t __rv;
+  __builtin_neon_vuzpv8qi (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+vuzp_s16 (int16x4_t __a, int16x4_t __b)
+{
+  int16x4x2_t __rv;
+  __builtin_neon_vuzpv4hi (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vuzp_s32 (int32x2_t __a, int32x2_t __b)
+{
+  int32x2x2_t __rv;
+  __builtin_neon_vuzpv2si (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vuzp_f32 (float32x2_t __a, float32x2_t __b)
+{
+  float32x2x2_t __rv;
+  __builtin_neon_vuzpv2sf (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+vuzp_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  uint8x8x2_t __rv;
+  __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+vuzp_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  uint16x4x2_t __rv;
+  __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vuzp_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  uint32x2x2_t __rv;
+  __builtin_neon_vuzpv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+vuzp_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  poly8x8x2_t __rv;
+  __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+vuzp_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+  poly16x4x2_t __rv;
+  __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
+vuzpq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  int8x16x2_t __rv;
+  __builtin_neon_vuzpv16qi (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
+vuzpq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  int16x8x2_t __rv;
+  __builtin_neon_vuzpv8hi (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
+vuzpq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  int32x4x2_t __rv;
+  __builtin_neon_vuzpv4si (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
+vuzpq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  float32x4x2_t __rv;
+  __builtin_neon_vuzpv4sf (&__rv.val[0], __a, __b);
+  return __rv;
+}
+
+__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
+vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  uint8x16x2_t __rv;
+  __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
+vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  uint16x8x2_t __rv;
+  __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
+vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  uint32x4x2_t __rv;
+  __builtin_neon_vuzpv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
+vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  poly8x16x2_t __rv;
+  __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  return __rv;
+}
+
+__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
+vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
+{
+  poly16x8x2_t __rv;
+  __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  return __rv;
+}
+
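The vtrn*, vzip* and vuzp* intrinsics each return a two-vector structure (the *x2_t types), performing transpose, interleave and deinterleave permutations respectively. A short sketch of the interleave case, with illustrative names:

  #include <arm_neon.h>

  /* Interleave: .val[0] = {a0,b0,a1,b1}, .val[1] = {a2,b2,a3,b3}.  */
  int16x4x2_t interleave (int16x4_t a, int16x4_t b)
  {
    return vzip_s16 (a, b);
  }
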
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vld1_s8 (const int8_t * __a)
+{
+  return (int8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vld1_s16 (const int16_t * __a)
+{
+  return (int16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vld1_s32 (const int32_t * __a)
+{
+  return (int32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vld1_s64 (const int64_t * __a)
+{
+  return (int64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vld1_f32 (const float32_t * __a)
+{
+  return (float32x2_t)__builtin_neon_vld1v2sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vld1_u8 (const uint8_t * __a)
+{
+  return (uint8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vld1_u16 (const uint16_t * __a)
+{
+  return (uint16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vld1_u32 (const uint32_t * __a)
+{
+  return (uint32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vld1_u64 (const uint64_t * __a)
+{
+  return (uint64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vld1_p8 (const poly8_t * __a)
+{
+  return (poly8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vld1_p16 (const poly16_t * __a)
+{
+  return (poly16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vld1q_s8 (const int8_t * __a)
+{
+  return (int8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vld1q_s16 (const int16_t * __a)
+{
+  return (int16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vld1q_s32 (const int32_t * __a)
+{
+  return (int32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vld1q_s64 (const int64_t * __a)
+{
+  return (int64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vld1q_f32 (const float32_t * __a)
+{
+  return (float32x4_t)__builtin_neon_vld1v4sf (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vld1q_u8 (const uint8_t * __a)
+{
+  return (uint8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vld1q_u16 (const uint16_t * __a)
+{
+  return (uint16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vld1q_u32 (const uint32_t * __a)
+{
+  return (uint32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vld1q_u64 (const uint64_t * __a)
+{
+  return (uint64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vld1q_p8 (const poly8_t * __a)
+{
+  return (poly8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vld1q_p16 (const poly16_t * __a)
+{
+  return (poly16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vld1_lane_s8 (const int8_t * __a, int8x8_t __b, const int __c)
+{
+  return (int8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vld1_lane_s16 (const int16_t * __a, int16x4_t __b, const int __c)
+{
+  return (int16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vld1_lane_s32 (const int32_t * __a, int32x2_t __b, const int __c)
+{
+  return (int32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, __b, __c);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vld1_lane_f32 (const float32_t * __a, float32x2_t __b, const int __c)
+{
+  return (float32x2_t)__builtin_neon_vld1_lanev2sf (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vld1_lane_u8 (const uint8_t * __a, uint8x8_t __b, const int __c)
+{
+  return (uint8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vld1_lane_u16 (const uint16_t * __a, uint16x4_t __b, const int __c)
+{
+  return (uint16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vld1_lane_u32 (const uint32_t * __a, uint32x2_t __b, const int __c)
+{
+  return (uint32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, (int32x2_t) __b, __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vld1_lane_p8 (const poly8_t * __a, poly8x8_t __b, const int __c)
+{
+  return (poly8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vld1_lane_p16 (const poly16_t * __a, poly16x4_t __b, const int __c)
+{
+  return (poly16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vld1_lane_s64 (const int64_t * __a, int64x1_t __b, const int __c)
+{
+  return (int64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, __b, __c);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vld1_lane_u64 (const uint64_t * __a, uint64x1_t __b, const int __c)
+{
+  return (uint64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, (int64x1_t) __b, __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vld1q_lane_s8 (const int8_t * __a, int8x16_t __b, const int __c)
+{
+  return (int8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_s16 (const int16_t * __a, int16x8_t __b, const int __c)
+{
+  return (int16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, const int __c)
+{
+  return (int32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, __b, __c);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int __c)
+{
+  return (float32x4_t)__builtin_neon_vld1_lanev4sf (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vld1q_lane_u8 (const uint8_t * __a, uint8x16_t __b, const int __c)
+{
+  return (uint8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_u16 (const uint16_t * __a, uint16x8_t __b, const int __c)
+{
+  return (uint16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vld1q_lane_u32 (const uint32_t * __a, uint32x4_t __b, const int __c)
+{
+  return (uint32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, (int32x4_t) __b, __c);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vld1q_lane_p8 (const poly8_t * __a, poly8x16_t __b, const int __c)
+{
+  return (poly8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_p16 (const poly16_t * __a, poly16x8_t __b, const int __c)
+{
+  return (poly16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vld1q_lane_s64 (const int64_t * __a, int64x2_t __b, const int __c)
+{
+  return (int64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vld1q_lane_u64 (const uint64_t * __a, uint64x2_t __b, const int __c)
+{
+  return (uint64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, (int64x2_t) __b, __c);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vld1_dup_s8 (const int8_t * __a)
+{
+  return (int8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vld1_dup_s16 (const int16_t * __a)
+{
+  return (int16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vld1_dup_s32 (const int32_t * __a)
+{
+  return (int32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vld1_dup_f32 (const float32_t * __a)
+{
+  return (float32x2_t)__builtin_neon_vld1_dupv2sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vld1_dup_u8 (const uint8_t * __a)
+{
+  return (uint8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vld1_dup_u16 (const uint16_t * __a)
+{
+  return (uint16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vld1_dup_u32 (const uint32_t * __a)
+{
+  return (uint32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vld1_dup_p8 (const poly8_t * __a)
+{
+  return (poly8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vld1_dup_p16 (const poly16_t * __a)
+{
+  return (poly16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vld1_dup_s64 (const int64_t * __a)
+{
+  return (int64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vld1_dup_u64 (const uint64_t * __a)
+{
+  return (uint64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vld1q_dup_s8 (const int8_t * __a)
+{
+  return (int8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vld1q_dup_s16 (const int16_t * __a)
+{
+  return (int16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vld1q_dup_s32 (const int32_t * __a)
+{
+  return (int32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vld1q_dup_f32 (const float32_t * __a)
+{
+  return (float32x4_t)__builtin_neon_vld1_dupv4sf (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vld1q_dup_u8 (const uint8_t * __a)
+{
+  return (uint8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vld1q_dup_u16 (const uint16_t * __a)
+{
+  return (uint16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vld1q_dup_u32 (const uint32_t * __a)
+{
+  return (uint32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vld1q_dup_p8 (const poly8_t * __a)
+{
+  return (poly8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vld1q_dup_p16 (const poly16_t * __a)
+{
+  return (poly16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vld1q_dup_s64 (const int64_t * __a)
+{
+  return (int64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vld1q_dup_u64 (const uint64_t * __a)
+{
+  return (uint64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a);
+}
+
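The vld1 family loads whole vectors, vld1*_lane loads a single lane into an existing vector, and vld1*_dup replicates one element across all lanes. A hedged sketch combining two of them (names are illustrative; vaddq_f32 is defined elsewhere in this header):

  #include <arm_neon.h>

  /* Load four floats and add a memory-resident bias to every lane.  */
  float32x4_t add_bias (const float32_t *data, const float32_t *bias)
  {
    float32x4_t v = vld1q_f32 (data);       /* four consecutive floats */
    float32x4_t b = vld1q_dup_f32 (bias);   /* one float splatted to all lanes */
    return vaddq_f32 (v, b);
  }
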
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s8 (int8_t * __a, int8x8_t __b)
+{
+  __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s16 (int16_t * __a, int16x4_t __b)
+{
+  __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s32 (int32_t * __a, int32x2_t __b)
+{
+  __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s64 (int64_t * __a, int64x1_t __b)
+{
+  __builtin_neon_vst1di ((__builtin_neon_di *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_f32 (float32_t * __a, float32x2_t __b)
+{
+  __builtin_neon_vst1v2sf (__a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u8 (uint8_t * __a, uint8x8_t __b)
+{
+  __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u16 (uint16_t * __a, uint16x4_t __b)
+{
+  __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u32 (uint32_t * __a, uint32x2_t __b)
+{
+  __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, (int32x2_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u64 (uint64_t * __a, uint64x1_t __b)
+{
+  __builtin_neon_vst1di ((__builtin_neon_di *) __a, (int64x1_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_p8 (poly8_t * __a, poly8x8_t __b)
+{
+  __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_p16 (poly16_t * __a, poly16x4_t __b)
+{
+  __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s8 (int8_t * __a, int8x16_t __b)
+{
+  __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s16 (int16_t * __a, int16x8_t __b)
+{
+  __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s32 (int32_t * __a, int32x4_t __b)
+{
+  __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s64 (int64_t * __a, int64x2_t __b)
+{
+  __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_f32 (float32_t * __a, float32x4_t __b)
+{
+  __builtin_neon_vst1v4sf (__a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u8 (uint8_t * __a, uint8x16_t __b)
+{
+  __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u16 (uint16_t * __a, uint16x8_t __b)
+{
+  __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u32 (uint32_t * __a, uint32x4_t __b)
+{
+  __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, (int32x4_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u64 (uint64_t * __a, uint64x2_t __b)
+{
+  __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, (int64x2_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_p8 (poly8_t * __a, poly8x16_t __b)
+{
+  __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_p16 (poly16_t * __a, poly16x8_t __b)
+{
+  __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_s8 (int8_t * __a, int8x8_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_s16 (int16_t * __a, int16x4_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_s32 (int32_t * __a, int32x2_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_f32 (float32_t * __a, float32x2_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev2sf (__a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_u8 (uint8_t * __a, uint8x8_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_u16 (uint16_t * __a, uint16x4_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_u32 (uint32_t * __a, uint32x2_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, (int32x2_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_p8 (poly8_t * __a, poly8x8_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_p16 (poly16_t * __a, poly16x4_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_s64 (int64_t * __a, int64x1_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_u64 (uint64_t * __a, uint64x1_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, (int64x1_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_s8 (int8_t * __a, int8x16_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_s16 (int16_t * __a, int16x8_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_s32 (int32_t * __a, int32x4_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_f32 (float32_t * __a, float32x4_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev4sf (__a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_u8 (uint8_t * __a, uint8x16_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_u16 (uint16_t * __a, uint16x8_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_u32 (uint32_t * __a, uint32x4_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, (int32x4_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_p8 (poly8_t * __a, poly8x16_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_p16 (poly16_t * __a, poly16x8_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_s64 (int64_t * __a, int64x2_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_u64 (uint64_t * __a, uint64x2_t __b, const int __c)
+{
+  __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, (int64x2_t) __b, __c);
+}
+
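The vst1 family mirrors vld1 on the store side, with vst1*_lane writing a single selected lane back to memory. A minimal round-trip sketch with illustrative names:

  #include <arm_neon.h>

  /* Copy eight bytes through a NEON register.  */
  void copy8 (uint8_t *dst, const uint8_t *src)
  {
    vst1_u8 (dst, vld1_u8 (src));
  }
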
+__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+vld2_s8 (const int8_t * __a)
+{
+  union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+vld2_s16 (const int16_t * __a)
+{
+  union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vld2_s32 (const int32_t * __a)
+{
+  union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v2si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vld2_f32 (const float32_t * __a)
+{
+  union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v2sf (__a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+vld2_u8 (const uint8_t * __a)
+{
+  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+vld2_u16 (const uint16_t * __a)
+{
+  union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vld2_u32 (const uint32_t * __a)
+{
+  union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v2si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+vld2_p8 (const poly8_t * __a)
+{
+  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+vld2_p16 (const poly16_t * __a)
+{
+  union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
+vld2_s64 (const int64_t * __a)
+{
+  union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2di ((const __builtin_neon_di *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
+vld2_u64 (const uint64_t * __a)
+{
+  union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2di ((const __builtin_neon_di *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
+vld2q_s8 (const int8_t * __a)
+{
+  union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v16qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
+vld2q_s16 (const int16_t * __a)
+{
+  union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v8hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
+vld2q_s32 (const int32_t * __a)
+{
+  union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v4si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
+vld2q_f32 (const float32_t * __a)
+{
+  union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v4sf (__a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
+vld2q_u8 (const uint8_t * __a)
+{
+  union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v16qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
+vld2q_u16 (const uint16_t * __a)
+{
+  union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v8hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
+vld2q_u32 (const uint32_t * __a)
+{
+  union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v4si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
+vld2q_p8 (const poly8_t * __a)
+{
+  union { poly8x16x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v16qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
+vld2q_p16 (const poly16_t * __a)
+{
+  union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2v8hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+vld2_lane_s8 (const int8_t * __a, int8x8x2_t __b, const int __c)
+{
+  union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+vld2_lane_s16 (const int16_t * __a, int16x4x2_t __b, const int __c)
+{
+  union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vld2_lane_s32 (const int32_t * __a, int32x2x2_t __b, const int __c)
+{
+  union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c)
+{
+  union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev2sf (__a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+vld2_lane_u8 (const uint8_t * __a, uint8x8x2_t __b, const int __c)
+{
+  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+vld2_lane_u16 (const uint16_t * __a, uint16x4x2_t __b, const int __c)
+{
+  union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vld2_lane_u32 (const uint32_t * __a, uint32x2x2_t __b, const int __c)
+{
+  union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+vld2_lane_p8 (const poly8_t * __a, poly8x8x2_t __b, const int __c)
+{
+  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+vld2_lane_p16 (const poly16_t * __a, poly16x4x2_t __b, const int __c)
+{
+  union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
+vld2q_lane_s16 (const int16_t * __a, int16x8x2_t __b, const int __c)
+{
+  union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
+vld2q_lane_s32 (const int32_t * __a, int32x4x2_t __b, const int __c)
+{
+  union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
+vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c)
+{
+  union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev4sf (__a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
+vld2q_lane_u16 (const uint16_t * __a, uint16x8x2_t __b, const int __c)
+{
+  union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
+vld2q_lane_u32 (const uint32_t * __a, uint32x4x2_t __b, const int __c)
+{
+  union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
+vld2q_lane_p16 (const poly16_t * __a, poly16x8x2_t __b, const int __c)
+{
+  union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+vld2_dup_s8 (const int8_t * __a)
+{
+  union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_dupv8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+vld2_dup_s16 (const int16_t * __a)
+{
+  union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_dupv4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vld2_dup_s32 (const int32_t * __a)
+{
+  union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_dupv2si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vld2_dup_f32 (const float32_t * __a)
+{
+  union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_dupv2sf (__a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+vld2_dup_u8 (const uint8_t * __a)
+{
+  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_dupv8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+vld2_dup_u16 (const uint16_t * __a)
+{
+  union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_dupv4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vld2_dup_u32 (const uint32_t * __a)
+{
+  union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_dupv2si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+vld2_dup_p8 (const poly8_t * __a)
+{
+  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_dupv8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+vld2_dup_p16 (const poly16_t * __a)
+{
+  union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_dupv4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
+vld2_dup_s64 (const int64_t * __a)
+{
+  union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_dupdi ((const __builtin_neon_di *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
+vld2_dup_u64 (const uint64_t * __a)
+{
+  union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+  __rv.__o = __builtin_neon_vld2_dupdi ((const __builtin_neon_di *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_s8 (int8_t * __a, int8x8x2_t __b)
+{
+  union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_s16 (int16_t * __a, int16x4x2_t __b)
+{
+  union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_s32 (int32_t * __a, int32x2x2_t __b)
+{
+  union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2v2si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_f32 (float32_t * __a, float32x2x2_t __b)
+{
+  union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2v2sf (__a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_u8 (uint8_t * __a, uint8x8x2_t __b)
+{
+  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_u16 (uint16_t * __a, uint16x4x2_t __b)
+{
+  union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_u32 (uint32_t * __a, uint32x2x2_t __b)
+{
+  union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2v2si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_p8 (poly8_t * __a, poly8x8x2_t __b)
+{
+  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_p16 (poly16_t * __a, poly16x4x2_t __b)
+{
+  union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_s64 (int64_t * __a, int64x1x2_t __b)
+{
+  union { int64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_u64 (uint64_t * __a, uint64x1x2_t __b)
+{
+  union { uint64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_s8 (int8_t * __a, int8x16x2_t __b)
+{
+  union { int8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_s16 (int16_t * __a, int16x8x2_t __b)
+{
+  union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_s32 (int32_t * __a, int32x4x2_t __b)
+{
+  union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_f32 (float32_t * __a, float32x4x2_t __b)
+{
+  union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2v4sf (__a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_u8 (uint8_t * __a, uint8x16x2_t __b)
+{
+  union { uint8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_u16 (uint16_t * __a, uint16x8x2_t __b)
+{
+  union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_u32 (uint32_t * __a, uint32x4x2_t __b)
+{
+  union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_p8 (poly8_t * __a, poly8x16x2_t __b)
+{
+  union { poly8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_p16 (poly16_t * __a, poly16x8x2_t __b)
+{
+  union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_s8 (int8_t * __a, int8x8x2_t __b, const int __c)
+{
+  union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_s16 (int16_t * __a, int16x4x2_t __b, const int __c)
+{
+  union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_s32 (int32_t * __a, int32x2x2_t __b, const int __c)
+{
+  union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_f32 (float32_t * __a, float32x2x2_t __b, const int __c)
+{
+  union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev2sf (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_u8 (uint8_t * __a, uint8x8x2_t __b, const int __c)
+{
+  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_u16 (uint16_t * __a, uint16x4x2_t __b, const int __c)
+{
+  union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_u32 (uint32_t * __a, uint32x2x2_t __b, const int __c)
+{
+  union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_p8 (poly8_t * __a, poly8x8x2_t __b, const int __c)
+{
+  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_p16 (poly16_t * __a, poly16x4x2_t __b, const int __c)
+{
+  union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_s16 (int16_t * __a, int16x8x2_t __b, const int __c)
+{
+  union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_s32 (int32_t * __a, int32x4x2_t __b, const int __c)
+{
+  union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_f32 (float32_t * __a, float32x4x2_t __b, const int __c)
+{
+  union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev4sf (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_u16 (uint16_t * __a, uint16x8x2_t __b, const int __c)
+{
+  union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_u32 (uint32_t * __a, uint32x4x2_t __b, const int __c)
+{
+  union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_p16 (poly16_t * __a, poly16x8x2_t __b, const int __c)
+{
+  union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
+vld3_s8 (const int8_t * __a)
+{
+  union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
+vld3_s16 (const int16_t * __a)
+{
+  union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
+vld3_s32 (const int32_t * __a)
+{
+  union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v2si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
+vld3_f32 (const float32_t * __a)
+{
+  union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v2sf (__a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
+vld3_u8 (const uint8_t * __a)
+{
+  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
+vld3_u16 (const uint16_t * __a)
+{
+  union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
+vld3_u32 (const uint32_t * __a)
+{
+  union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v2si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
+vld3_p8 (const poly8_t * __a)
+{
+  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
+vld3_p16 (const poly16_t * __a)
+{
+  union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
+vld3_s64 (const int64_t * __a)
+{
+  union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3di ((const __builtin_neon_di *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
+vld3_u64 (const uint64_t * __a)
+{
+  union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3di ((const __builtin_neon_di *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
+vld3q_s8 (const int8_t * __a)
+{
+  union { int8x16x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v16qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
+vld3q_s16 (const int16_t * __a)
+{
+  union { int16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v8hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
+vld3q_s32 (const int32_t * __a)
+{
+  union { int32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v4si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
+vld3q_f32 (const float32_t * __a)
+{
+  union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v4sf (__a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
+vld3q_u8 (const uint8_t * __a)
+{
+  union { uint8x16x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v16qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
+vld3q_u16 (const uint16_t * __a)
+{
+  union { uint16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v8hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
+vld3q_u32 (const uint32_t * __a)
+{
+  union { uint32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v4si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
+vld3q_p8 (const poly8_t * __a)
+{
+  union { poly8x16x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v16qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
+vld3q_p16 (const poly16_t * __a)
+{
+  union { poly16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3v8hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
+vld3_lane_s8 (const int8_t * __a, int8x8x3_t __b, const int __c)
+{
+  union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
+vld3_lane_s16 (const int16_t * __a, int16x4x3_t __b, const int __c)
+{
+  union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
+vld3_lane_s32 (const int32_t * __a, int32x2x3_t __b, const int __c)
+{
+  union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
+vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c)
+{
+  union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev2sf (__a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
+vld3_lane_u8 (const uint8_t * __a, uint8x8x3_t __b, const int __c)
+{
+  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
+vld3_lane_u16 (const uint16_t * __a, uint16x4x3_t __b, const int __c)
+{
+  union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
+vld3_lane_u32 (const uint32_t * __a, uint32x2x3_t __b, const int __c)
+{
+  union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
+vld3_lane_p8 (const poly8_t * __a, poly8x8x3_t __b, const int __c)
+{
+  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
+vld3_lane_p16 (const poly16_t * __a, poly16x4x3_t __b, const int __c)
+{
+  union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
+vld3q_lane_s16 (const int16_t * __a, int16x8x3_t __b, const int __c)
+{
+  union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  union { int16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
+vld3q_lane_s32 (const int32_t * __a, int32x4x3_t __b, const int __c)
+{
+  union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  union { int32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
+vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c)
+{
+  union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev4sf (__a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
+vld3q_lane_u16 (const uint16_t * __a, uint16x8x3_t __b, const int __c)
+{
+  union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  union { uint16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
+vld3q_lane_u32 (const uint32_t * __a, uint32x4x3_t __b, const int __c)
+{
+  union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  union { uint32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
+vld3q_lane_p16 (const poly16_t * __a, poly16x8x3_t __b, const int __c)
+{
+  union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  union { poly16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
+vld3_dup_s8 (const int8_t * __a)
+{
+  union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_dupv8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
+vld3_dup_s16 (const int16_t * __a)
+{
+  union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_dupv4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
+vld3_dup_s32 (const int32_t * __a)
+{
+  union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_dupv2si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
+vld3_dup_f32 (const float32_t * __a)
+{
+  union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_dupv2sf (__a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
+vld3_dup_u8 (const uint8_t * __a)
+{
+  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_dupv8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
+vld3_dup_u16 (const uint16_t * __a)
+{
+  union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_dupv4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
+vld3_dup_u32 (const uint32_t * __a)
+{
+  union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_dupv2si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
+vld3_dup_p8 (const poly8_t * __a)
+{
+  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_dupv8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
+vld3_dup_p16 (const poly16_t * __a)
+{
+  union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_dupv4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
+vld3_dup_s64 (const int64_t * __a)
+{
+  union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_dupdi ((const __builtin_neon_di *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
+vld3_dup_u64 (const uint64_t * __a)
+{
+  union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+  __rv.__o = __builtin_neon_vld3_dupdi ((const __builtin_neon_di *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_s8 (int8_t * __a, int8x8x3_t __b)
+{
+  union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_s16 (int16_t * __a, int16x4x3_t __b)
+{
+  union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_s32 (int32_t * __a, int32x2x3_t __b)
+{
+  union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3v2si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_f32 (float32_t * __a, float32x2x3_t __b)
+{
+  union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3v2sf (__a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_u8 (uint8_t * __a, uint8x8x3_t __b)
+{
+  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_u16 (uint16_t * __a, uint16x4x3_t __b)
+{
+  union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_u32 (uint32_t * __a, uint32x2x3_t __b)
+{
+  union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3v2si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_p8 (poly8_t * __a, poly8x8x3_t __b)
+{
+  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_p16 (poly16_t * __a, poly16x4x3_t __b)
+{
+  union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_s64 (int64_t * __a, int64x1x3_t __b)
+{
+  union { int64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_u64 (uint64_t * __a, uint64x1x3_t __b)
+{
+  union { uint64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_s8 (int8_t * __a, int8x16x3_t __b)
+{
+  union { int8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_s16 (int16_t * __a, int16x8x3_t __b)
+{
+  union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_s32 (int32_t * __a, int32x4x3_t __b)
+{
+  union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_f32 (float32_t * __a, float32x4x3_t __b)
+{
+  union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3v4sf (__a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_u8 (uint8_t * __a, uint8x16x3_t __b)
+{
+  union { uint8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_u16 (uint16_t * __a, uint16x8x3_t __b)
+{
+  union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_u32 (uint32_t * __a, uint32x4x3_t __b)
+{
+  union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_p8 (poly8_t * __a, poly8x16x3_t __b)
+{
+  union { poly8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_p16 (poly16_t * __a, poly16x8x3_t __b)
+{
+  union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_s8 (int8_t * __a, int8x8x3_t __b, const int __c)
+{
+  union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_s16 (int16_t * __a, int16x4x3_t __b, const int __c)
+{
+  union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_s32 (int32_t * __a, int32x2x3_t __b, const int __c)
+{
+  union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_f32 (float32_t * __a, float32x2x3_t __b, const int __c)
+{
+  union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev2sf (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_u8 (uint8_t * __a, uint8x8x3_t __b, const int __c)
+{
+  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_u16 (uint16_t * __a, uint16x4x3_t __b, const int __c)
+{
+  union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_u32 (uint32_t * __a, uint32x2x3_t __b, const int __c)
+{
+  union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_p8 (poly8_t * __a, poly8x8x3_t __b, const int __c)
+{
+  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_p16 (poly16_t * __a, poly16x4x3_t __b, const int __c)
+{
+  union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_s16 (int16_t * __a, int16x8x3_t __b, const int __c)
+{
+  union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_s32 (int32_t * __a, int32x4x3_t __b, const int __c)
+{
+  union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_f32 (float32_t * __a, float32x4x3_t __b, const int __c)
+{
+  union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev4sf (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_u16 (uint16_t * __a, uint16x8x3_t __b, const int __c)
+{
+  union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_u32 (uint32_t * __a, uint32x4x3_t __b, const int __c)
+{
+  union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_p16 (poly16_t * __a, poly16x8x3_t __b, const int __c)
+{
+  union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
+vld4_s8 (const int8_t * __a)
+{
+  union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
+vld4_s16 (const int16_t * __a)
+{
+  union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
+vld4_s32 (const int32_t * __a)
+{
+  union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v2si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
+vld4_f32 (const float32_t * __a)
+{
+  union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v2sf (__a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
+vld4_u8 (const uint8_t * __a)
+{
+  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
+vld4_u16 (const uint16_t * __a)
+{
+  union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
+vld4_u32 (const uint32_t * __a)
+{
+  union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v2si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
+vld4_p8 (const poly8_t * __a)
+{
+  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
+vld4_p16 (const poly16_t * __a)
+{
+  union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
+vld4_s64 (const int64_t * __a)
+{
+  union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4di ((const __builtin_neon_di *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
+vld4_u64 (const uint64_t * __a)
+{
+  union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4di ((const __builtin_neon_di *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
+vld4q_s8 (const int8_t * __a)
+{
+  union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v16qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
+vld4q_s16 (const int16_t * __a)
+{
+  union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v8hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
+vld4q_s32 (const int32_t * __a)
+{
+  union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v4si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
+vld4q_f32 (const float32_t * __a)
+{
+  union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v4sf (__a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
+vld4q_u8 (const uint8_t * __a)
+{
+  union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v16qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
+vld4q_u16 (const uint16_t * __a)
+{
+  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v8hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
+vld4q_u32 (const uint32_t * __a)
+{
+  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v4si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
+vld4q_p8 (const poly8_t * __a)
+{
+  union { poly8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v16qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
+vld4q_p16 (const poly16_t * __a)
+{
+  union { poly16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4v8hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
+vld4_lane_s8 (const int8_t * __a, int8x8x4_t __b, const int __c)
+{
+  union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
+vld4_lane_s16 (const int16_t * __a, int16x4x4_t __b, const int __c)
+{
+  union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
+vld4_lane_s32 (const int32_t * __a, int32x2x4_t __b, const int __c)
+{
+  union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
+vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c)
+{
+  union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev2sf (__a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
+vld4_lane_u8 (const uint8_t * __a, uint8x8x4_t __b, const int __c)
+{
+  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
+vld4_lane_u16 (const uint16_t * __a, uint16x4x4_t __b, const int __c)
+{
+  union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
+vld4_lane_u32 (const uint32_t * __a, uint32x2x4_t __b, const int __c)
+{
+  union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
+vld4_lane_p8 (const poly8_t * __a, poly8x8x4_t __b, const int __c)
+{
+  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
+vld4_lane_p16 (const poly16_t * __a, poly16x4x4_t __b, const int __c)
+{
+  union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
+vld4q_lane_s16 (const int16_t * __a, int16x8x4_t __b, const int __c)
+{
+  union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
+vld4q_lane_s32 (const int32_t * __a, int32x4x4_t __b, const int __c)
+{
+  union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
+vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c)
+{
+  union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev4sf (__a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
+vld4q_lane_u16 (const uint16_t * __a, uint16x8x4_t __b, const int __c)
+{
+  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
+vld4q_lane_u32 (const uint32_t * __a, uint32x4x4_t __b, const int __c)
+{
+  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
+vld4q_lane_p16 (const poly16_t * __a, poly16x8x4_t __b, const int __c)
+{
+  union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  union { poly16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+  return __rv.__i;
+}
+
+__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
+vld4_dup_s8 (const int8_t * __a)
+{
+  union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_dupv8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
+vld4_dup_s16 (const int16_t * __a)
+{
+  union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_dupv4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
+vld4_dup_s32 (const int32_t * __a)
+{
+  union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_dupv2si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
+vld4_dup_f32 (const float32_t * __a)
+{
+  union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_dupv2sf (__a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
+vld4_dup_u8 (const uint8_t * __a)
+{
+  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_dupv8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
+vld4_dup_u16 (const uint16_t * __a)
+{
+  union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_dupv4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
+vld4_dup_u32 (const uint32_t * __a)
+{
+  union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_dupv2si ((const __builtin_neon_si *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
+vld4_dup_p8 (const poly8_t * __a)
+{
+  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_dupv8qi ((const __builtin_neon_qi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
+vld4_dup_p16 (const poly16_t * __a)
+{
+  union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_dupv4hi ((const __builtin_neon_hi *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
+vld4_dup_s64 (const int64_t * __a)
+{
+  union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_dupdi ((const __builtin_neon_di *) __a);
+  return __rv.__i;
+}
+
+__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
+vld4_dup_u64 (const uint64_t * __a)
+{
+  union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+  __rv.__o = __builtin_neon_vld4_dupdi ((const __builtin_neon_di *) __a);
+  return __rv.__i;
+}
+
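A minimal usage sketch for the vld4 lane/dup wrappers above (illustrative only, not
part of the patch; the helper names are arbitrary and the code assumes NEON is enabled):

   /* vld4_dup_u8 loads 4 consecutive bytes and replicates byte i across
      all lanes of vector i; vld4_lane_u8 then refills only lane 0 of each
      vector from a second pointer.  The lane index must be a compile-time
      constant, 0-7 for the 8x8x4 forms.  */
   static uint8x8x4_t example_vld4 (const uint8_t *group, const uint8_t *one)
   {
     uint8x8x4_t planes = vld4_dup_u8 (group);   /* each byte broadcast into its own vector */
     return vld4_lane_u8 (one, planes, 0);       /* overwrite lane 0 of each vector */
   }
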
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_s8 (int8_t * __a, int8x8x4_t __b)
+{
+  union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_s16 (int16_t * __a, int16x4x4_t __b)
+{
+  union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_s32 (int32_t * __a, int32x2x4_t __b)
+{
+  union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4v2si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_f32 (float32_t * __a, float32x2x4_t __b)
+{
+  union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4v2sf (__a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_u8 (uint8_t * __a, uint8x8x4_t __b)
+{
+  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_u16 (uint16_t * __a, uint16x4x4_t __b)
+{
+  union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_u32 (uint32_t * __a, uint32x2x4_t __b)
+{
+  union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4v2si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_p8 (poly8_t * __a, poly8x8x4_t __b)
+{
+  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_p16 (poly16_t * __a, poly16x4x4_t __b)
+{
+  union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_s64 (int64_t * __a, int64x1x4_t __b)
+{
+  union { int64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_u64 (uint64_t * __a, uint64x1x4_t __b)
+{
+  union { uint64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_s8 (int8_t * __a, int8x16x4_t __b)
+{
+  union { int8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_s16 (int16_t * __a, int16x8x4_t __b)
+{
+  union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_s32 (int32_t * __a, int32x4x4_t __b)
+{
+  union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_f32 (float32_t * __a, float32x4x4_t __b)
+{
+  union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4v4sf (__a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_u8 (uint8_t * __a, uint8x16x4_t __b)
+{
+  union { uint8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_u16 (uint16_t * __a, uint16x8x4_t __b)
+{
+  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_u32 (uint32_t * __a, uint32x4x4_t __b)
+{
+  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_p8 (poly8_t * __a, poly8x16x4_t __b)
+{
+  union { poly8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_p16 (poly16_t * __a, poly16x8x4_t __b)
+{
+  union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_s8 (int8_t * __a, int8x8x4_t __b, const int __c)
+{
+  union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_s16 (int16_t * __a, int16x4x4_t __b, const int __c)
+{
+  union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_s32 (int32_t * __a, int32x2x4_t __b, const int __c)
+{
+  union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_f32 (float32_t * __a, float32x2x4_t __b, const int __c)
+{
+  union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev2sf (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_u8 (uint8_t * __a, uint8x8x4_t __b, const int __c)
+{
+  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_u16 (uint16_t * __a, uint16x4x4_t __b, const int __c)
+{
+  union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_u32 (uint32_t * __a, uint32x2x4_t __b, const int __c)
+{
+  union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_p8 (poly8_t * __a, poly8x8x4_t __b, const int __c)
+{
+  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_p16 (poly16_t * __a, poly16x4x4_t __b, const int __c)
+{
+  union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_s16 (int16_t * __a, int16x8x4_t __b, const int __c)
+{
+  union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_s32 (int32_t * __a, int32x4x4_t __b, const int __c)
+{
+  union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_f32 (float32_t * __a, float32x4x4_t __b, const int __c)
+{
+  union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev4sf (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_u16 (uint16_t * __a, uint16x8x4_t __b, const int __c)
+{
+  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_u32 (uint32_t * __a, uint32x4x4_t __b, const int __c)
+{
+  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_p16 (poly16_t * __a, poly16x8x4_t __b, const int __c)
+{
+  union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
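A matching sketch for the vst4 store wrappers (again illustrative, not part of the
patch; the helper name is arbitrary):

   /* vst4_u8 interleaves four 8-element channel vectors into 32 bytes of
      c0,c1,c2,c3 groups; vst4_lane_u8 stores just one group (4 bytes) from
      the given constant lane, here re-storing lane 3 at its slot.  */
   static void example_vst4 (uint8_t *dst, uint8x8x4_t planes)
   {
     vst4_u8 (dst, planes);                  /* 8 interleaved groups of 4 bytes */
     vst4_lane_u8 (dst + 3 * 4, planes, 3);  /* lane index must be a constant */
   }
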
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vand_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vandv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vand_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vandv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vand_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vandv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vand_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vanddi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vand_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vandv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vand_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vandv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vand_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vandv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vand_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x1_t)__builtin_neon_vanddi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vandq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vandv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vandq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vandv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vandq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vandv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vandq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vandv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vandq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vandv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vandq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vandv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vandq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vandv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vandq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vandv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vorr_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vorrv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vorr_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vorrv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vorr_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vorrv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vorr_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vorrdi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vorr_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vorrv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vorr_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vorrv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vorr_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vorrv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vorr_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x1_t)__builtin_neon_vorrdi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vorrq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vorrv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vorrq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vorrv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vorrq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vorrv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vorrq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vorrv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vorrq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vorrv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vorrq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vorrv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vorrq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vorrv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vorrq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vorrv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+veor_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_veorv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+veor_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_veorv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+veor_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_veorv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+veor_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_veordi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+veor_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_veorv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+veor_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_veorv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+veor_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_veorv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+veor_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x1_t)__builtin_neon_veordi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+veorq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_veorv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+veorq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_veorv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+veorq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_veorv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+veorq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_veorv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+veorq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_veorv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+veorq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_veorv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+veorq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_veorv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+veorq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_veorv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vbic_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vbicv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vbic_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vbicv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vbic_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vbicv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vbic_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vbicdi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vbic_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vbicv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vbic_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vbicv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vbic_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vbicv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vbic_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x1_t)__builtin_neon_vbicdi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vbicq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vbicv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vbicq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vbicv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vbicq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vbicv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vbicq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vbicv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vbicq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vbicv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vbicq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vbicv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vbicq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vbicv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vbicq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vbicv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vorn_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t)__builtin_neon_vornv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vorn_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t)__builtin_neon_vornv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vorn_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t)__builtin_neon_vornv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vorn_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vorndi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vorn_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t)__builtin_neon_vornv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vorn_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vornv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vorn_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t)__builtin_neon_vornv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vorn_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x1_t)__builtin_neon_vorndi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vornq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t)__builtin_neon_vornv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vornq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t)__builtin_neon_vornv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vornq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t)__builtin_neon_vornv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vornq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int64x2_t)__builtin_neon_vornv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vornq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t)__builtin_neon_vornv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vornq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vornv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vornq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t)__builtin_neon_vornv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vornq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint64x2_t)__builtin_neon_vornv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+
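The bitwise wrappers above map directly onto the VAND/VORR/VEOR/VBIC/VORN
instructions. One common composition of them is a bitwise select; the sketch below
is illustrative only and not part of the patch:

   /* (a & mask) | (b & ~mask): take bits of a where mask is set and bits
      of b elsewhere.  vbic_u32 (x, y) computes x & ~y.  */
   static uint32x2_t example_bitselect (uint32x2_t a, uint32x2_t b, uint32x2_t mask)
   {
     return vorr_u32 (vand_u32 (a, mask), vbic_u32 (b, mask));
   }
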
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_s8 (int8x8_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_s16 (int16x4_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_s32 (int32x2_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_s64 (int64x1_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_f32 (float32x2_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_u8 (uint8x8_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_u16 (uint16x4_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_u32 (uint32x2_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_u64 (uint64x1_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vreinterpretv8qidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_p16 (poly16x4_t __a)
+{
+  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_s8 (int8x16_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_s16 (int16x8_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_s32 (int32x4_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_s64 (int64x2_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_f32 (float32x4_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_u8 (uint8x16_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_u16 (uint16x8_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_u32 (uint32x4_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_u64 (uint64x2_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_p16 (poly16x8_t __a)
+{
+  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_s8 (int8x8_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_s16 (int16x4_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_s32 (int32x2_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_s64 (int64x1_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_f32 (float32x2_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_u8 (uint8x8_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_u16 (uint16x4_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_u32 (uint32x2_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_u64 (uint64x1_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vreinterpretv4hidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_p8 (poly8x8_t __a)
+{
+  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_s8 (int8x16_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_s16 (int16x8_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_s32 (int32x4_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_s64 (int64x2_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_f32 (float32x4_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_u8 (uint8x16_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_u16 (uint16x8_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_u32 (uint32x4_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_u64 (uint64x2_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_p8 (poly8x16_t __a)
+{
+  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_s8 (int8x8_t __a)
+{
+  return (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_s16 (int16x4_t __a)
+{
+  return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_s32 (int32x2_t __a)
+{
+  return (float32x2_t)__builtin_neon_vreinterpretv2sfv2si (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_s64 (int64x1_t __a)
+{
+  return (float32x2_t)__builtin_neon_vreinterpretv2sfdi (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_u8 (uint8x8_t __a)
+{
+  return (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_u16 (uint16x4_t __a)
+{
+  return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_u32 (uint32x2_t __a)
+{
+  return (float32x2_t)__builtin_neon_vreinterpretv2sfv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_u64 (uint64x1_t __a)
+{
+  return (float32x2_t)__builtin_neon_vreinterpretv2sfdi ((int64x1_t) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_p8 (poly8x8_t __a)
+{
+  return (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_p16 (poly16x4_t __a)
+{
+  return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_s8 (int8x16_t __a)
+{
+  return (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi (__a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_s16 (int16x8_t __a)
+{
+  return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi (__a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_s32 (int32x4_t __a)
+{
+  return (float32x4_t)__builtin_neon_vreinterpretv4sfv4si (__a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_s64 (int64x2_t __a)
+{
+  return (float32x4_t)__builtin_neon_vreinterpretv4sfv2di (__a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_u8 (uint8x16_t __a)
+{
+  return (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_u16 (uint16x8_t __a)
+{
+  return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_u32 (uint32x4_t __a)
+{
+  return (float32x4_t)__builtin_neon_vreinterpretv4sfv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_u64 (uint64x2_t __a)
+{
+  return (float32x4_t)__builtin_neon_vreinterpretv4sfv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_p8 (poly8x16_t __a)
+{
+  return (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_p16 (poly16x8_t __a)
+{
+  return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_s8 (int8x8_t __a)
+{
+  return (int64x1_t)__builtin_neon_vreinterpretdiv8qi (__a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_s16 (int16x4_t __a)
+{
+  return (int64x1_t)__builtin_neon_vreinterpretdiv4hi (__a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_s32 (int32x2_t __a)
+{
+  return (int64x1_t)__builtin_neon_vreinterpretdiv2si (__a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_f32 (float32x2_t __a)
+{
+  return (int64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_u8 (uint8x8_t __a)
+{
+  return (int64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_u16 (uint16x4_t __a)
+{
+  return (int64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_u32 (uint32x2_t __a)
+{
+  return (int64x1_t)__builtin_neon_vreinterpretdiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_u64 (uint64x1_t __a)
+{
+  return (int64x1_t)__builtin_neon_vreinterpretdidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_p8 (poly8x8_t __a)
+{
+  return (int64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_p16 (poly16x4_t __a)
+{
+  return (int64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_s8 (int8x16_t __a)
+{
+  return (int64x2_t)__builtin_neon_vreinterpretv2div16qi (__a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_s16 (int16x8_t __a)
+{
+  return (int64x2_t)__builtin_neon_vreinterpretv2div8hi (__a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_s32 (int32x4_t __a)
+{
+  return (int64x2_t)__builtin_neon_vreinterpretv2div4si (__a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_f32 (float32x4_t __a)
+{
+  return (int64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_u8 (uint8x16_t __a)
+{
+  return (int64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_u16 (uint16x8_t __a)
+{
+  return (int64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_u32 (uint32x4_t __a)
+{
+  return (int64x2_t)__builtin_neon_vreinterpretv2div4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_u64 (uint64x2_t __a)
+{
+  return (int64x2_t)__builtin_neon_vreinterpretv2div2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_p8 (poly8x16_t __a)
+{
+  return (int64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_p16 (poly16x8_t __a)
+{
+  return (int64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_s8 (int8x8_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vreinterpretdiv8qi (__a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_s16 (int16x4_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vreinterpretdiv4hi (__a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_s32 (int32x2_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vreinterpretdiv2si (__a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_s64 (int64x1_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vreinterpretdidi (__a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_f32 (float32x2_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_u8 (uint8x8_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_u16 (uint16x4_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_u32 (uint32x2_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vreinterpretdiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_p8 (poly8x8_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_p16 (poly16x4_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_s8 (int8x16_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vreinterpretv2div16qi (__a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_s16 (int16x8_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vreinterpretv2div8hi (__a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_s32 (int32x4_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vreinterpretv2div4si (__a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_s64 (int64x2_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vreinterpretv2div2di (__a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_f32 (float32x4_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_u8 (uint8x16_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_u16 (uint16x8_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_u32 (uint32x4_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vreinterpretv2div4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_p8 (poly8x16_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_p16 (poly16x8_t __a)
+{
+  return (uint64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_s16 (int16x4_t __a)
+{
+  return (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_s32 (int32x2_t __a)
+{
+  return (int8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_s64 (int64x1_t __a)
+{
+  return (int8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_f32 (float32x2_t __a)
+{
+  return (int8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_u8 (uint8x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_u16 (uint16x4_t __a)
+{
+  return (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_u32 (uint32x2_t __a)
+{
+  return (int8x8_t)__builtin_neon_vreinterpretv8qiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_u64 (uint64x1_t __a)
+{
+  return (int8x8_t)__builtin_neon_vreinterpretv8qidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_p8 (poly8x8_t __a)
+{
+  return (int8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_p16 (poly16x4_t __a)
+{
+  return (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_s16 (int16x8_t __a)
+{
+  return (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_s32 (int32x4_t __a)
+{
+  return (int8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_s64 (int64x2_t __a)
+{
+  return (int8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_f32 (float32x4_t __a)
+{
+  return (int8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_u8 (uint8x16_t __a)
+{
+  return (int8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_u16 (uint16x8_t __a)
+{
+  return (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_u32 (uint32x4_t __a)
+{
+  return (int8x16_t)__builtin_neon_vreinterpretv16qiv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_u64 (uint64x2_t __a)
+{
+  return (int8x16_t)__builtin_neon_vreinterpretv16qiv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_p8 (poly8x16_t __a)
+{
+  return (int8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_p16 (poly16x8_t __a)
+{
+  return (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_s8 (int8x8_t __a)
+{
+  return (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_s32 (int32x2_t __a)
+{
+  return (int16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_s64 (int64x1_t __a)
+{
+  return (int16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_f32 (float32x2_t __a)
+{
+  return (int16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_u8 (uint8x8_t __a)
+{
+  return (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_u16 (uint16x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_u32 (uint32x2_t __a)
+{
+  return (int16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_u64 (uint64x1_t __a)
+{
+  return (int16x4_t)__builtin_neon_vreinterpretv4hidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_p8 (poly8x8_t __a)
+{
+  return (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_p16 (poly16x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_s8 (int8x16_t __a)
+{
+  return (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_s32 (int32x4_t __a)
+{
+  return (int16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_s64 (int64x2_t __a)
+{
+  return (int16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_f32 (float32x4_t __a)
+{
+  return (int16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_u8 (uint8x16_t __a)
+{
+  return (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_u16 (uint16x8_t __a)
+{
+  return (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_u32 (uint32x4_t __a)
+{
+  return (int16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_u64 (uint64x2_t __a)
+{
+  return (int16x8_t)__builtin_neon_vreinterpretv8hiv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_p8 (poly8x16_t __a)
+{
+  return (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_p16 (poly16x8_t __a)
+{
+  return (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_s8 (int8x8_t __a)
+{
+  return (int32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_s16 (int16x4_t __a)
+{
+  return (int32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_s64 (int64x1_t __a)
+{
+  return (int32x2_t)__builtin_neon_vreinterpretv2sidi (__a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_f32 (float32x2_t __a)
+{
+  return (int32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_u8 (uint8x8_t __a)
+{
+  return (int32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_u16 (uint16x4_t __a)
+{
+  return (int32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_u32 (uint32x2_t __a)
+{
+  return (int32x2_t)__builtin_neon_vreinterpretv2siv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_u64 (uint64x1_t __a)
+{
+  return (int32x2_t)__builtin_neon_vreinterpretv2sidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_p8 (poly8x8_t __a)
+{
+  return (int32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_p16 (poly16x4_t __a)
+{
+  return (int32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_s8 (int8x16_t __a)
+{
+  return (int32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_s16 (int16x8_t __a)
+{
+  return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi (__a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_s64 (int64x2_t __a)
+{
+  return (int32x4_t)__builtin_neon_vreinterpretv4siv2di (__a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_f32 (float32x4_t __a)
+{
+  return (int32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_u8 (uint8x16_t __a)
+{
+  return (int32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_u16 (uint16x8_t __a)
+{
+  return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_u32 (uint32x4_t __a)
+{
+  return (int32x4_t)__builtin_neon_vreinterpretv4siv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_u64 (uint64x2_t __a)
+{
+  return (int32x4_t)__builtin_neon_vreinterpretv4siv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_p8 (poly8x16_t __a)
+{
+  return (int32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_p16 (poly16x8_t __a)
+{
+  return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_s8 (int8x8_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_s16 (int16x4_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_s32 (int32x2_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_s64 (int64x1_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_f32 (float32x2_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_u16 (uint16x4_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_u32 (uint32x2_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_u64 (uint64x1_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vreinterpretv8qidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_p8 (poly8x8_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_p16 (poly16x4_t __a)
+{
+  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_s8 (int8x16_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_s16 (int16x8_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_s32 (int32x4_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_s64 (int64x2_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_f32 (float32x4_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_u16 (uint16x8_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_u32 (uint32x4_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_u64 (uint64x2_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_p8 (poly8x16_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_p16 (poly16x8_t __a)
+{
+  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_s8 (int8x8_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_s16 (int16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_s32 (int32x2_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_s64 (int64x1_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_f32 (float32x2_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_u8 (uint8x8_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_u32 (uint32x2_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_u64 (uint64x1_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vreinterpretv4hidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_p8 (poly8x8_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_p16 (poly16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_s8 (int8x16_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_s16 (int16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_s32 (int32x4_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_s64 (int64x2_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_f32 (float32x4_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_u8 (uint8x16_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_u32 (uint32x4_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_u64 (uint64x2_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_p8 (poly8x16_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_p16 (poly16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_s8 (int8x8_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_s16 (int16x4_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_s32 (int32x2_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vreinterpretv2siv2si (__a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_s64 (int64x1_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vreinterpretv2sidi (__a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_f32 (float32x2_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_u8 (uint8x8_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_u16 (uint16x4_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_u64 (uint64x1_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vreinterpretv2sidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_p8 (poly8x8_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_p16 (poly16x4_t __a)
+{
+  return (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_s8 (int8x16_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_s16 (int16x8_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi (__a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_s32 (int32x4_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vreinterpretv4siv4si (__a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_s64 (int64x2_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vreinterpretv4siv2di (__a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_f32 (float32x4_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_u8 (uint8x16_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_u16 (uint16x8_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_u64 (uint64x2_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vreinterpretv4siv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_p8 (poly8x16_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_p16 (poly16x8_t __a)
+{
+  return (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a);
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+#endif
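The vreinterpret family defined above performs bitwise reinterpretation only: the destination type sees the same 64 or 128 bits, no lane values are converted and no instructions need to be emitted. A minimal usage sketch, assuming a NEON-enabled build (e.g. -mfloat-abi=softfp -mfpu=neon) and using only intrinsics declared in this header; the helper names are illustrative only:

  #include <arm_neon.h>

  /* Illustrative helper: view eight unsigned bytes as four signed
     halfwords.  The register contents are unchanged.  */
  static int16x4_t
  halves_from_bytes (uint8x8_t bytes)
  {
    return vreinterpret_s16_u8 (bytes);
  }

  /* Illustrative helper: recover the raw IEEE-754 bit patterns of
     four single-precision floats.  */
  static uint32x4_t
  float_bits (float32x4_t v)
  {
    return vreinterpretq_u32_f32 (v);
  }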

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/bpabi.S
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/bpabi.S?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/bpabi.S (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/bpabi.S Wed Jul 22 15:36:27 2009
@@ -44,7 +44,10 @@
 ARM_FUNC_START aeabi_lcmp
 	subs	ip, xxl, yyl
 	sbcs	ip, xxh, yyh
-	subeqs  ip, xxl, yyl
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	eq
+	COND(sub,s,eq)  ip, xxl, yyl
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	mov	r0, ip
 	RET
 	FUNC_END aeabi_lcmp
@@ -55,12 +58,20 @@
 
 ARM_FUNC_START aeabi_ulcmp
 	cmp	xxh, yyh
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	lo
 	movlo	r0, #-1
+	do_it	hi
 	movhi	r0, #1
+	do_it	ne
 	RETc(ne)
 	cmp	xxl, yyl
+	do_it	lo
 	movlo	r0, #-1
+	do_it	hi
 	movhi	r0, #1
+	do_it	eq
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	moveq	r0, #0
 	RET
 	FUNC_END aeabi_ulcmp
@@ -71,11 +82,18 @@
 
 ARM_FUNC_START aeabi_ldivmod
 	sub sp, sp, #8
-	stmfd sp!, {sp, lr}
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+	mov ip, sp
+	push {ip, lr}
+#else
+	do_push {sp, lr}
+#endif
 	bl SYM(__gnu_ldivmod_helper) __PLT__
 	ldr lr, [sp, #4]
 	add sp, sp, #8
-	ldmfd sp!, {r2, r3}
+	do_pop {r2, r3}
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	RET
 	
 #endif /* L_aeabi_ldivmod */
@@ -84,11 +102,18 @@
 
 ARM_FUNC_START aeabi_uldivmod
 	sub sp, sp, #8
-	stmfd sp!, {sp, lr}
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+	mov ip, sp
+	push {ip, lr}
+#else
+	do_push {sp, lr}
+#endif
 	bl SYM(__gnu_uldivmod_helper) __PLT__
 	ldr lr, [sp, #4]
 	add sp, sp, #8
-	ldmfd sp!, {r2, r3}
+	do_pop {r2, r3}
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	RET
 	
 #endif /* L_aeabi_divmod */
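The routines patched here are the EABI run-time helpers for 64-bit comparison and division; the change replaces ARM-only conditional execution and ldm/stm sequences with the do_it/do_push/do_pop macros so the same source also assembles as Thumb-2. A rough sketch of the kind of C that typically ends up calling these helpers on an ARM EABI target (the function name is illustrative; exact code generation depends on target options):

  /* 64-bit division has no single ARM instruction; on EABI targets the
     compiler normally emits a call to a helper such as __aeabi_uldivmod,
     which returns both quotient and remainder.  */
  unsigned long long
  blocks_needed (unsigned long long bytes, unsigned long long block_size)
  {
    return (bytes + block_size - 1) / block_size;
  }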

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/cirrus.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/cirrus.md?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/cirrus.md (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/cirrus.md Wed Jul 22 15:36:27 2009
@@ -34,7 +34,8 @@
   [(set (match_operand:DI          0 "cirrus_fp_register" "=v")
 	(plus:DI (match_operand:DI 1 "cirrus_fp_register"  "v")
 		 (match_operand:DI 2 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfadd64%?\\t%V0, %V1, %V2"
   [(set_attr "type" "mav_farith")
    (set_attr "cirrus" "normal")]
@@ -44,7 +45,8 @@
   [(set (match_operand:SI          0 "cirrus_fp_register" "=v")
 	(plus:SI (match_operand:SI 1 "cirrus_fp_register" "v")
 		 (match_operand:SI 2 "cirrus_fp_register" "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
   "cfadd32%?\\t%V0, %V1, %V2"
   [(set_attr "type" "mav_farith")
    (set_attr "cirrus" "normal")]
@@ -54,7 +56,8 @@
   [(set (match_operand:SF          0 "cirrus_fp_register" "=v")
 	(plus:SF (match_operand:SF 1 "cirrus_fp_register" "v")
 		 (match_operand:SF 2 "cirrus_fp_register" "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfadds%?\\t%V0, %V1, %V2"
   [(set_attr "type" "mav_farith")
    (set_attr "cirrus" "normal")]
@@ -64,7 +67,8 @@
   [(set (match_operand:DF          0 "cirrus_fp_register" "=v")
 	(plus:DF (match_operand:DF 1 "cirrus_fp_register" "v")
 		 (match_operand:DF 2 "cirrus_fp_register" "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfaddd%?\\t%V0, %V1, %V2"
   [(set_attr "type" "mav_farith")
    (set_attr "cirrus" "normal")]
@@ -74,7 +78,8 @@
   [(set (match_operand:DI           0 "cirrus_fp_register" "=v")
 	(minus:DI (match_operand:DI 1 "cirrus_fp_register"  "v")
 		  (match_operand:DI 2 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfsub64%?\\t%V0, %V1, %V2"
   [(set_attr "type" "mav_farith")
    (set_attr "cirrus" "normal")]
@@ -84,7 +89,8 @@
   [(set (match_operand:SI           0 "cirrus_fp_register" "=v")
 	(minus:SI (match_operand:SI 1 "cirrus_fp_register" "v")
 		  (match_operand:SI 2 "cirrus_fp_register" "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
   "cfsub32%?\\t%V0, %V1, %V2"
   [(set_attr "type" "mav_farith")
    (set_attr "cirrus" "normal")]
@@ -94,7 +100,8 @@
   [(set (match_operand:SF           0 "cirrus_fp_register" "=v")
 	(minus:SF (match_operand:SF 1 "cirrus_fp_register"  "v")
 		  (match_operand:SF 2 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfsubs%?\\t%V0, %V1, %V2"
   [(set_attr "type" "mav_farith")
    (set_attr "cirrus" "normal")]
@@ -104,7 +111,8 @@
   [(set (match_operand:DF           0 "cirrus_fp_register" "=v")
 	(minus:DF (match_operand:DF 1 "cirrus_fp_register" "v")
 		  (match_operand:DF 2 "cirrus_fp_register" "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfsubd%?\\t%V0, %V1, %V2"
   [(set_attr "type" "mav_farith")
    (set_attr "cirrus" "normal")]
@@ -114,7 +122,8 @@
   [(set (match_operand:SI          0 "cirrus_fp_register" "=v")
 	(mult:SI (match_operand:SI 2 "cirrus_fp_register"  "v")
 		 (match_operand:SI 1 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
   "cfmul32%?\\t%V0, %V1, %V2"
   [(set_attr "type" "mav_farith")
    (set_attr "cirrus" "normal")]
@@ -125,7 +134,8 @@
   [(set (match_operand:DI          0 "cirrus_fp_register" "=v")
 	(mult:DI (match_operand:DI 2 "cirrus_fp_register"  "v")
 		 (match_operand:DI 1 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfmul64%?\\t%V0, %V1, %V2"
   [(set_attr "type" "mav_dmult")
    (set_attr "cirrus" "normal")]
@@ -137,7 +147,8 @@
 	  (mult:SI (match_operand:SI 1 "cirrus_fp_register"  "v")
 		   (match_operand:SI 2 "cirrus_fp_register"  "v"))
 	  (match_operand:SI          3 "cirrus_fp_register"  "0")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
   "cfmac32%?\\t%V0, %V1, %V2"
   [(set_attr "type" "mav_farith")
    (set_attr "cirrus" "normal")]
@@ -150,7 +161,8 @@
 	  (match_operand:SI          1 "cirrus_fp_register"  "0")
 	  (mult:SI (match_operand:SI 2 "cirrus_fp_register"  "v")
 		   (match_operand:SI 3 "cirrus_fp_register"  "v"))))]
-  "0 && TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "0 && TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfmsc32%?\\t%V0, %V2, %V3"
   [(set_attr "type" "mav_farith")
    (set_attr "cirrus" "normal")]
@@ -160,7 +172,8 @@
   [(set (match_operand:SF          0 "cirrus_fp_register" "=v")
 	(mult:SF (match_operand:SF 1 "cirrus_fp_register"  "v")
 		 (match_operand:SF 2 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfmuls%?\\t%V0, %V1, %V2"
   [(set_attr "type" "mav_farith")
    (set_attr "cirrus" "normal")]
@@ -170,7 +183,8 @@
   [(set (match_operand:DF          0 "cirrus_fp_register" "=v")
 	(mult:DF (match_operand:DF 1 "cirrus_fp_register"  "v")
 		 (match_operand:DF 2 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfmuld%?\\t%V0, %V1, %V2"
   [(set_attr "type" "mav_dmult")
    (set_attr "cirrus" "normal")]
@@ -180,7 +194,8 @@
   [(set (match_operand:SI            0 "cirrus_fp_register" "=v")
 	(ashift:SI (match_operand:SI 1 "cirrus_fp_register"  "v")
 		   (match_operand:SI 2 "cirrus_shift_const"  "")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
   "cfsh32%?\\t%V0, %V1, #%s2"
   [(set_attr "cirrus" "normal")]
 )
@@ -189,7 +204,8 @@
   [(set (match_operand:SI	       0 "cirrus_fp_register" "=v")
 	(ashiftrt:SI (match_operand:SI 1 "cirrus_fp_register"  "v")
 		     (match_operand:SI 2 "cirrus_shift_const"  "")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
   "cfsh32%?\\t%V0, %V1, #-%s2"
   [(set_attr "cirrus" "normal")]
 )
@@ -198,7 +214,8 @@
   [(set (match_operand:SI            0 "cirrus_fp_register" "=v")
 	(ashift:SI (match_operand:SI 1 "cirrus_fp_register"  "v")
 		   (match_operand:SI 2 "register_operand"    "r")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
   "cfrshl32%?\\t%V1, %V0, %s2"
   [(set_attr "cirrus" "normal")]
 )
@@ -207,7 +224,8 @@
   [(set (match_operand:DI            0 "cirrus_fp_register" "=v")
 	(ashift:DI (match_operand:DI 1 "cirrus_fp_register"  "v")
 		   (match_operand:SI 2 "register_operand"    "r")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfrshl64%?\\t%V1, %V0, %s2"
   [(set_attr "cirrus" "normal")]
 )
@@ -216,7 +234,8 @@
   [(set (match_operand:DI            0 "cirrus_fp_register" "=v")
 	(ashift:DI (match_operand:DI 1 "cirrus_fp_register"  "v")
 		   (match_operand:SI 2 "cirrus_shift_const"  "")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfsh64%?\\t%V0, %V1, #%s2"
   [(set_attr "cirrus" "normal")]
 )
@@ -225,7 +244,8 @@
   [(set (match_operand:DI            0 "cirrus_fp_register" "=v")
 	(ashiftrt:DI (match_operand:DI 1 "cirrus_fp_register"  "v")
 		     (match_operand:SI 2 "cirrus_shift_const"  "")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfsh64%?\\t%V0, %V1, #-%s2"
   [(set_attr "cirrus" "normal")]
 )
@@ -233,7 +253,8 @@
 (define_insn "*cirrus_absdi2"
   [(set (match_operand:DI         0 "cirrus_fp_register" "=v")
 	(abs:DI (match_operand:DI 1 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfabs64%?\\t%V0, %V1"
   [(set_attr "cirrus" "normal")]
 )
@@ -243,7 +264,8 @@
   [(set (match_operand:DI         0 "cirrus_fp_register" "=v")
 	(neg:DI (match_operand:DI 1 "cirrus_fp_register"  "v")))
    (clobber (reg:CC CC_REGNUM))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfneg64%?\\t%V0, %V1"
   [(set_attr "cirrus" "normal")]
 )
@@ -251,7 +273,8 @@
 (define_insn "*cirrus_negsi2"
   [(set (match_operand:SI         0 "cirrus_fp_register" "=v")
 	(neg:SI (match_operand:SI 1 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
   "cfneg32%?\\t%V0, %V1"
   [(set_attr "cirrus" "normal")]
 )
@@ -259,7 +282,8 @@
 (define_insn "*cirrus_negsf2"
   [(set (match_operand:SF         0 "cirrus_fp_register" "=v")
 	(neg:SF (match_operand:SF 1 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfnegs%?\\t%V0, %V1"
   [(set_attr "cirrus" "normal")]
 )
@@ -267,7 +291,8 @@
 (define_insn "*cirrus_negdf2"
   [(set (match_operand:DF         0 "cirrus_fp_register" "=v")
 	(neg:DF (match_operand:DF 1 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfnegd%?\\t%V0, %V1"
   [(set_attr "cirrus" "normal")]
 )
@@ -277,7 +302,8 @@
   [(set (match_operand:SI         0 "cirrus_fp_register" "=v")
         (abs:SI (match_operand:SI 1 "cirrus_fp_register"  "v")))
    (clobber (reg:CC CC_REGNUM))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
   "cfabs32%?\\t%V0, %V1"
   [(set_attr "cirrus" "normal")]
 )
@@ -285,7 +311,8 @@
 (define_insn "*cirrus_abssf2"
   [(set (match_operand:SF         0 "cirrus_fp_register" "=v")
         (abs:SF (match_operand:SF 1 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfabss%?\\t%V0, %V1"
   [(set_attr "cirrus" "normal")]
 )
@@ -293,7 +320,8 @@
 (define_insn "*cirrus_absdf2"
   [(set (match_operand:DF         0 "cirrus_fp_register" "=v")
         (abs:DF (match_operand:DF 1 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfabsd%?\\t%V0, %V1"
   [(set_attr "cirrus" "normal")]
 )
@@ -303,7 +331,8 @@
   [(set (match_operand:SF           0 "cirrus_fp_register" "=v")
  	(float:SF (match_operand:SI 1 "s_register_operand"  "r")))
    (clobber (match_scratch:DF 2 "=v"))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfmv64lr%?\\t%Z2, %1\;cfcvt32s%?\\t%V0, %Y2"
   [(set_attr "length" "8")
    (set_attr "cirrus" "move")]
@@ -313,7 +342,8 @@
   [(set (match_operand:DF           0 "cirrus_fp_register" "=v")
 	(float:DF (match_operand:SI 1 "s_register_operand" "r")))
    (clobber (match_scratch:DF 2 "=v"))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfmv64lr%?\\t%Z2, %1\;cfcvt32d%?\\t%V0, %Y2"
   [(set_attr "length" "8")
    (set_attr "cirrus" "move")]
@@ -322,14 +352,16 @@
 (define_insn "floatdisf2"
   [(set (match_operand:SF           0 "cirrus_fp_register" "=v")
 	(float:SF (match_operand:DI 1 "cirrus_fp_register" "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfcvt64s%?\\t%V0, %V1"
   [(set_attr "cirrus" "normal")])
 
 (define_insn "floatdidf2"
   [(set (match_operand:DF 0 "cirrus_fp_register" "=v")
 	(float:DF (match_operand:DI 1 "cirrus_fp_register" "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfcvt64d%?\\t%V0, %V1"
   [(set_attr "cirrus" "normal")])
 
@@ -337,7 +369,8 @@
   [(set (match_operand:SI         0 "s_register_operand" "=r")
 	(fix:SI (fix:SF (match_operand:SF 1 "cirrus_fp_register"  "v"))))
    (clobber (match_scratch:DF     2                      "=v"))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cftruncs32%?\\t%Y2, %V1\;cfmvr64l%?\\t%0, %Z2"
   [(set_attr "length" "8")
    (set_attr "cirrus" "normal")]
@@ -347,7 +380,8 @@
   [(set (match_operand:SI         0 "s_register_operand" "=r")
 	(fix:SI (fix:DF (match_operand:DF 1 "cirrus_fp_register"  "v"))))
    (clobber (match_scratch:DF     2                      "=v"))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cftruncd32%?\\t%Y2, %V1\;cfmvr64l%?\\t%0, %Z2"
   [(set_attr "length" "8")]
 )
@@ -356,7 +390,8 @@
   [(set (match_operand:SF  0 "cirrus_fp_register" "=v")
         (float_truncate:SF
          (match_operand:DF 1 "cirrus_fp_register" "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfcvtds%?\\t%V0, %V1"
   [(set_attr "cirrus" "normal")]
 )
@@ -364,7 +399,8 @@
 (define_insn "*cirrus_extendsfdf2"
   [(set (match_operand:DF                  0 "cirrus_fp_register" "=v")
         (float_extend:DF (match_operand:SF 1 "cirrus_fp_register"  "v")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
   "cfcvtsd%?\\t%V0, %V1"
   [(set_attr "cirrus" "normal")]
 )
@@ -457,3 +493,112 @@
    (set_attr "cirrus"         " not,   not,not,   not, not,normal,double,move,normal,double")]
 )
 
+;; APPLE LOCAL begin v7 support. Merge from mainline
+(define_insn "*cirrus_thumb2_movdi"
+  [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,o<>,v,r,v,m,v")
+	(match_operand:DI 1 "di_operand"              "rIK,mi,r,r,v,mi,v,v"))]
+  "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+  "*
+  {
+  switch (which_alternative)
+    {
+    case 0:
+    case 1:
+    case 2:
+      return (output_move_double (operands));
+
+    case 3: return \"cfmv64lr%?\\t%V0, %Q1\;cfmv64hr%?\\t%V0, %R1\";
+    case 4: return \"cfmvr64l%?\\t%Q0, %V1\;cfmvr64h%?\\t%R0, %V1\";
+
+    case 5: return \"cfldr64%?\\t%V0, %1\";
+    case 6: return \"cfstr64%?\\t%V1, %0\";
+
+    /* Shifting by 0 will just copy %1 into %0.  */
+    case 7: return \"cfsh64%?\\t%V0, %V1, #0\";
+
+    default: abort ();
+    }
+  }"
+  [(set_attr "length"         "  8,   8,     8,   8,     8,     4,     4,     4")
+   (set_attr "type"           "  *,load2,store2,   *,     *,  load2,store2,     *")
+   (set_attr "pool_range"     "  *,4096,     *,   *,     *,  1020,     *,     *")
+   (set_attr "neg_pool_range" "  *,   0,     *,   *,     *,  1008,     *,     *")
+   (set_attr "cirrus"         "not, not,   not,move,normal,double,double,normal")]
+)
+
+;; Cirrus SI values have been outlawed.  Look in arm.h for the comment
+;; on HARD_REGNO_MODE_OK.
+
+(define_insn "*cirrus_thumb2_movsi_insn"
+  [(set (match_operand:SI 0 "general_operand" "=r,r,r,m,*v,r,*v,T,*v")
+        (match_operand:SI 1 "general_operand" "rI,K,mi,r,r,*v,T,*v,*v"))]
+  "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0
+   && (register_operand (operands[0], SImode)
+       || register_operand (operands[1], SImode))"
+  "@
+   mov%?\\t%0, %1
+   mvn%?\\t%0, #%B1
+   ldr%?\\t%0, %1
+   str%?\\t%1, %0
+   cfmv64lr%?\\t%Z0, %1
+   cfmvr64l%?\\t%0, %Z1
+   cfldr32%?\\t%V0, %1
+   cfstr32%?\\t%V1, %0
+   cfsh32%?\\t%V0, %V1, #0"
+  [(set_attr "type"           "*,  *,  load1,store1,   *,     *,  load1,store1,     *")
+   (set_attr "pool_range"     "*,  *,  4096,     *,   *,     *,  1024,     *,     *")
+   (set_attr "neg_pool_range" "*,  *,     0,     *,   *,     *,  1012,     *,     *")
+   (set_attr "cirrus"         "not,not, not,   not,move,normal,normal,normal,normal")]
+)
+
+(define_insn "*thumb2_cirrus_movsf_hard_insn"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=v,v,v,r,m,r,r,m")
+        (match_operand:SF 1 "general_operand"      "v,mE,r,v,v,r,mE,r"))]
+  "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_MAVERICK
+   && (GET_CODE (operands[0]) != MEM
+       || register_operand (operands[1], SFmode))"
+  "@
+   cfcpys%?\\t%V0, %V1
+   cfldrs%?\\t%V0, %1
+   cfmvsr%?\\t%V0, %1
+   cfmvrs%?\\t%0, %V1
+   cfstrs%?\\t%V1, %0
+   mov%?\\t%0, %1
+   ldr%?\\t%0, %1\\t%@ float
+   str%?\\t%1, %0\\t%@ float"
+  [(set_attr "length"         "     *,     *,   *,     *,     *,  4,   4,     4")
+   (set_attr "type"           "     *,  load1,   *,     *,store1,  *,load1,store1")
+   (set_attr "pool_range"     "     *,   1020,   *,     *,     *,  *,4096,     *")
+   (set_attr "neg_pool_range" "     *,   1008,   *,     *,     *,  *,   0,     *")
+   (set_attr "cirrus"         "normal,normal,move,normal,normal,not, not,   not")]
+)
+
+(define_insn "*thumb2_cirrus_movdf_hard_insn"
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=r,Q,r,m,r,v,v,v,r,m")
+	(match_operand:DF 1 "general_operand"       "Q,r,r,r,mF,v,mF,r,v,v"))]
+  "TARGET_THUMB2
+   && TARGET_HARD_FLOAT && TARGET_MAVERICK
+   && (GET_CODE (operands[0]) != MEM
+       || register_operand (operands[1], DFmode))"
+  "*
+  {
+  switch (which_alternative)
+    {
+    case 0: return \"ldm%?ia\\t%m1, %M0\\t%@ double\";
+    case 1: return \"stm%?ia\\t%m0, %M1\\t%@ double\";
+    case 2: case 3: case 4: return output_move_double (operands);
+    case 5: return \"cfcpyd%?\\t%V0, %V1\";
+    case 6: return \"cfldrd%?\\t%V0, %1\";
+    case 7: return \"cfmvdlr\\t%V0, %Q1\;cfmvdhr%?\\t%V0, %R1\";
+    case 8: return \"cfmvrdl%?\\t%Q0, %V1\;cfmvrdh%?\\t%R0, %V1\";
+    case 9: return \"cfstrd%?\\t%V1, %0\";
+    default: abort ();
+    }
+  }"
+  [(set_attr "type"           "load1,store2,  *,store2,load1,     *,  load1,   *,     *,store2")
+   (set_attr "length"         "   4,     4,  8,     8,   8,     4,     4,   8,     8,     4")
+   (set_attr "pool_range"     "   *,     *,  *,     *,4092,     *,  1020,   *,     *,     *")
+   (set_attr "neg_pool_range" "   *,     *,  *,     *,   0,     *,  1008,   *,     *,     *")
+   (set_attr "cirrus"         " not,   not,not,   not, not,normal,double,move,normal,double")]
+)
+;; APPLE LOCAL end v7 support. Merge from Codesourcery

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/coff.h
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/coff.h?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/coff.h (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/coff.h Wed Jul 22 15:36:27 2009
@@ -59,9 +59,12 @@
 /* Define this macro if jump tables (for `tablejump' insns) should be
    output in the text section, along with the assembler instructions.
    Otherwise, the readonly data section is used.  */
-/* We put ARM jump tables in the text section, because it makes the code
-   more efficient, but for Thumb it's better to put them out of band.  */
-#define JUMP_TABLES_IN_TEXT_SECTION (TARGET_ARM)
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+/* We put ARM and Thumb-2 jump tables in the text section, because it makes
+   the code more efficient, but for Thumb-1 it's better to put them out of
+   band.  */
+#define JUMP_TABLES_IN_TEXT_SECTION (TARGET_32BIT)
+/* APPLE LOCAL end v7 support. Merge from mainline */
 
 #undef  READONLY_DATA_SECTION_ASM_OP
 #define READONLY_DATA_SECTION_ASM_OP	"\t.section .rdata"

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/constraints.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/constraints.md?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/constraints.md (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/constraints.md Wed Jul 22 15:36:27 2009
@@ -19,33 +19,49 @@
 ;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
 ;; Boston, MA 02110-1301, USA.
 
+;; APPLE LOCAL begin v7 support. Merge from mainline
 ;; The following register constraints have been used:
-;; - in ARM state: f, v, w, y, z
+;; - in ARM/Thumb-2 state: f, t, v, w, x, y, z
 ;; - in Thumb state: h, k, b
 ;; - in both states: l, c
 ;; In ARM state, 'l' is an alias for 'r'
 
 ;; The following normal constraints have been used:
-;; in ARM state: G, H, I, J, K, L, M
-;; in Thumb state: I, J, K, L, M, N, O
+;; in ARM/Thumb-2 state: G, H, I, J, K, L, M
+;; in Thumb-1 state: I, J, K, L, M, N, O
+;; APPLE LOCAL end v7 support. Merge from mainline
 
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
 ;; The following multi-letter normal constraints have been used:
 ;; APPLE LOCAL 5831562 long long constants
-;; in ARM state: Da, Db, Dc, Dd
+;; in ARM/Thumb-2 state: Da, Db, Dc, Dd, Dn, Dl, DL, Dv
 
 ;; The following memory constraints have been used:
-;; in ARM state: Q, Uq, Uv, Uy
+;; in ARM/Thumb-2 state: Q, Ut, Uv, Uy, Un, Us
+;; in ARM state: Uq
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
 
 
 (define_register_constraint "f" "TARGET_ARM ? FPA_REGS : NO_REGS"
  "Legacy FPA registers @code{f0}- at code{f7}.")
 
+;; APPLE LOCAL begin v7 support. Merge from mainline
+(define_register_constraint "t" "TARGET_32BIT ? VFP_LO_REGS : NO_REGS"
+ "The VFP registers @code{s0}- at code{s31}.")
+
+;; APPLE LOCAL end v7 support. Merge from mainline
 (define_register_constraint "v" "TARGET_ARM ? CIRRUS_REGS : NO_REGS"
  "The Cirrus Maverick co-processor registers.")
 
-(define_register_constraint "w" "TARGET_ARM ? VFP_REGS : NO_REGS"
- "The VFP registers @code{s0}- at code{s31}.")
+;; APPLE LOCAL begin v7 support. Merge from mainline
+(define_register_constraint "w"
+  "TARGET_32BIT ? (TARGET_VFP3 ? VFP_REGS : VFP_LO_REGS) : NO_REGS"
+ "The VFP registers @code{d0}- at code{d15}, or @code{d0}- at code{d31} for VFPv3.")
+
+(define_register_constraint "x" "TARGET_32BIT ? VFP_D0_D7_REGS : NO_REGS"
+ "The VFP registers @code{d0}- at code{d7}.")
 
+;; APPLE LOCAL end v7 support. Merge from mainline
 (define_register_constraint "y" "TARGET_REALLY_IWMMXT ? IWMMXT_REGS : NO_REGS"
  "The Intel iWMMX co-processor registers.")
 
@@ -70,109 +86,176 @@
 (define_register_constraint "c" "CC_REG"
  "@internal The condition code register.")
 
+;; APPLE LOCAL begin v7 support. Merge from mainline
 (define_constraint "I"
- "In ARM state a constant that can be used as an immediate value in a Data
-  Processing instruction.  In Thumb state a constant in the range 0-255."
+ "In ARM/Thumb-2 state a constant that can be used as an immediate value in a
+  Data Processing instruction.  In Thumb-1 state a constant in the range
+  0-255."
  (and (match_code "const_int")
-      (match_test "TARGET_ARM ? const_ok_for_arm (ival)
+      (match_test "TARGET_32BIT ? const_ok_for_arm (ival)
 		   : ival >= 0 && ival <= 255")))
 
 (define_constraint "J"
- "In ARM state a constant in the range @minus{}4095-4095.  In Thumb state
-  a constant in the range @minus{}255-@minus{}1."
+ "In ARM/Thumb-2 state a constant in the range @minus{}4095-4095.  In Thumb-1
+  state a constant in the range @minus{}255-@minus{}1."
  (and (match_code "const_int")
-      (match_test "TARGET_ARM ? (ival >= -4095 && ival <= 4095)
+      (match_test "TARGET_32BIT ? (ival >= -4095 && ival <= 4095)
 		   : (ival >= -255 && ival <= -1)")))
 
 (define_constraint "K"
- "In ARM state a constant that satisfies the @code{I} constraint if inverted.
-  In Thumb state a constant that satisfies the @code{I} constraint multiplied 
-  by any power of 2."
+ "In ARM/Thumb-2 state a constant that satisfies the @code{I} constraint if
+  inverted.  In Thumb-1 state a constant that satisfies the @code{I}
+  constraint multiplied by any power of 2."
  (and (match_code "const_int")
-      (match_test "TARGET_ARM ? const_ok_for_arm (~ival)
+      (match_test "TARGET_32BIT ? const_ok_for_arm (~ival)
 		   : thumb_shiftable_const (ival)")))
 
 (define_constraint "L"
- "In ARM state a constant that satisfies the @code{I} constraint if negated.
-  In Thumb state a constant in the range @minus{}7-7."
+ "In ARM/Thumb-2 state a constant that satisfies the @code{I} constraint if
+  negated.  In Thumb-1 state a constant in the range @minus{}7-7."
  (and (match_code "const_int")
-      (match_test "TARGET_ARM ? const_ok_for_arm (-ival)
+      (match_test "TARGET_32BIT ? const_ok_for_arm (-ival)
 		   : (ival >= -7 && ival <= 7)")))
 
 ;; The ARM state version is internal...
-;; @internal In ARM state a constant in the range 0-32 or any power of 2.
+;; @internal In ARM/Thumb-2 state a constant in the range 0-32 or any
+;; power of 2.
 (define_constraint "M"
- "In Thumb state a constant that is a multiple of 4 in the range 0-1020."
+ "In Thumb-1 state a constant that is a multiple of 4 in the range 0-1020."
  (and (match_code "const_int")
-      (match_test "TARGET_ARM ? ((ival >= 0 && ival <= 32)
+      (match_test "TARGET_32BIT ? ((ival >= 0 && ival <= 32)
 				 || ((ival & (ival - 1)) == 0))
 		   : ((ival >= 0 && ival <= 1020) && ((ival & 3) == 0))")))
 
 (define_constraint "N"
- "In Thumb state a constant in the range 0-31."
+ "In ARM/Thumb-2 state a constant suitable for a MOVW instruction.
+  In Thumb-1 state a constant in the range 0-31."
  (and (match_code "const_int")
-      (match_test "TARGET_THUMB && ival >= 0 && ival <= 31")))
+      (match_test "TARGET_32BIT ? arm_arch_thumb2 && ((ival & 0xffff0000) == 0)
+				: (ival >= 0 && ival <= 31)")))
 
 (define_constraint "O"
- "In Thumb state a constant that is a multiple of 4 in the range
+ "In Thumb-1 state a constant that is a multiple of 4 in the range
   @minus{}508-508."
  (and (match_code "const_int")
-      (match_test "TARGET_THUMB && ival >= -508 && ival <= 508
+      (match_test "TARGET_THUMB1 && ival >= -508 && ival <= 508
 		   && ((ival & 3) == 0)")))
 
 (define_constraint "G"
- "In ARM state a valid FPA immediate constant."
+ "In ARM/Thumb-2 state a valid FPA immediate constant."
  (and (match_code "const_double")
-      (match_test "TARGET_ARM && arm_const_double_rtx (op)")))
+      (match_test "TARGET_32BIT && arm_const_double_rtx (op)")))
 
 (define_constraint "H"
- "In ARM state a valid FPA immediate constant when negated."
+ "In ARM/Thumb-2 state a valid FPA immediate constant when negated."
  (and (match_code "const_double")
-      (match_test "TARGET_ARM && neg_const_double_rtx_ok_for_fpa (op)")))
+      (match_test "TARGET_32BIT && neg_const_double_rtx_ok_for_fpa (op)")))
 
 (define_constraint "Da"
  "@internal
-  In ARM state a const_int, const_double or const_vector that can
+  In ARM/Thumb-2 state a const_int, const_double or const_vector that can
   be generated with two Data Processing insns."
  (and (match_code "const_double,const_int,const_vector")
-      (match_test "TARGET_ARM && arm_const_double_inline_cost (op) == 2")))
+      (match_test "TARGET_32BIT && arm_const_double_inline_cost (op) == 2")))
 
 (define_constraint "Db"
  "@internal
-  In ARM state a const_int, const_double or const_vector that can
+  In ARM/Thumb-2 state a const_int, const_double or const_vector that can
   be generated with three Data Processing insns."
  (and (match_code "const_double,const_int,const_vector")
-      (match_test "TARGET_ARM && arm_const_double_inline_cost (op) == 3")))
+      (match_test "TARGET_32BIT && arm_const_double_inline_cost (op) == 3")))
 
 (define_constraint "Dc"
  "@internal
-  In ARM state a const_int, const_double or const_vector that can
+  In ARM/Thumb-2 state a const_int, const_double or const_vector that can
   be generated with four Data Processing insns.  This pattern is disabled
   if optimizing for space or when we have load-delay slots to fill."
  (and (match_code "const_double,const_int,const_vector")
-      (match_test "TARGET_ARM && arm_const_double_inline_cost (op) == 4
+      (match_test "TARGET_32BIT && arm_const_double_inline_cost (op) == 4
 		   && !(optimize_size || arm_ld_sched)")))
-
 ;; APPLE LOCAL begin 5831562 long long constants
 (define_constraint "Dd"
  "@internal
  In ARM state a const_int, const_double or const_vector that can be
   used directly in arithmetic instructions as two 32-bit immediates."
  (and (match_code "const_double,const_int,const_vector")
-      (match_test "TARGET_ARM && const64_ok_for_arm_immediate (op)")))
+      (match_test "TARGET_32BIT && const64_ok_for_arm_immediate (op)")))
 ;; APPLE LOCAL end 5831562 long long constants
+;; APPLE LOCAL end v7 support. Merge from mainline
+
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+(define_constraint "Dn"
+ "@internal
+  In ARM/Thumb-2 state a const_vector which can be loaded with a Neon vmov
+  immediate instruction."
+ (and (match_code "const_vector")
+      (match_test "TARGET_32BIT
+		   && imm_for_neon_mov_operand (op, GET_MODE (op))")))
+
+(define_constraint "Dl"
+ "@internal
+  In ARM/Thumb-2 state a const_vector which can be used with a Neon vorr or
+  vbic instruction."
+ (and (match_code "const_vector")
+      (match_test "TARGET_32BIT
+		   && imm_for_neon_logic_operand (op, GET_MODE (op))")))
+
+(define_constraint "DL"
+ "@internal
+  In ARM/Thumb-2 state a const_vector which can be used with a Neon vorn or
+  vand instruction."
+ (and (match_code "const_vector")
+      (match_test "TARGET_32BIT
+		   && imm_for_neon_inv_logic_operand (op, GET_MODE (op))")))
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
+;; APPLE LOCAL begin v7 support. Merge from mainline
+
+(define_constraint "Dv"
+ "@internal
+  In ARM/Thumb-2 state a const_double which can be used with a VFP fconsts
+  or fconstd instruction."
+ (and (match_code "const_double")
+      (match_test "TARGET_32BIT && vfp3_const_double_rtx (op)")))
+;; APPLE LOCAL end v7 support. Merge from mainline
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+
+(define_memory_constraint "Ut"
+ "@internal
+  In ARM/Thumb-2 state an address valid for loading/storing opaque structure
+  types wider than TImode."
+ (and (match_code "mem")
+      (match_test "TARGET_32BIT && neon_struct_mem_operand (op)")))
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
 
+;; APPLE LOCAL begin v7 support. Merge from mainline
 (define_memory_constraint "Uv"
  "@internal
-  In ARM state a valid VFP load/store address."
+  In ARM/Thumb-2 state a valid VFP load/store address."
  (and (match_code "mem")
-      (match_test "TARGET_ARM && arm_coproc_mem_operand (op, FALSE)")))
+      (match_test "TARGET_32BIT && arm_coproc_mem_operand (op, FALSE)")))
 
 (define_memory_constraint "Uy"
  "@internal
-  In ARM state a valid iWMMX load/store address."
+  In ARM/Thumb-2 state a valid iWMMX load/store address."
+ (and (match_code "mem")
+      (match_test "TARGET_32BIT && arm_coproc_mem_operand (op, TRUE)")))
+;; APPLE LOCAL end v7 support. Merge from mainline
+
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+(define_memory_constraint "Un"
+ "@internal
+  In ARM/Thumb-2 state a valid address for Neon element and structure
+  load/store instructions."
+ (and (match_code "mem")
+      (match_test "TARGET_32BIT && neon_vector_mem_operand (op, FALSE)")))
+
+(define_memory_constraint "Us"
+ "@internal
+  In ARM/Thumb-2 state a valid address for non-offset loads/stores of
+  quad-word values in four ARM registers."
  (and (match_code "mem")
-      (match_test "TARGET_ARM && arm_coproc_mem_operand (op, TRUE)")))
+      (match_test "TARGET_32BIT && neon_vector_mem_operand (op, TRUE)")))
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
 
 (define_memory_constraint "Uq"
  "@internal
@@ -182,11 +265,13 @@
 		   && arm_legitimate_address_p (GET_MODE (op), XEXP (op, 0),
 						SIGN_EXTEND, 0)")))
 
+;; APPLE LOCAL begin v7 support. Merge from mainline
 (define_memory_constraint "Q"
  "@internal
-  In ARM state an address that is a single base register."
+  In ARM/Thumb-2 state an address that is a single base register."
  (and (match_code "mem")
       (match_test "REG_P (XEXP (op, 0))")))
+;; APPLE LOCAL end v7 support. Merge from mainline
 
 ;; We used to have constraint letters for S and R in ARM state, but
 ;; all uses of these now appear to have been removed.

Added: llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8-neon.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8-neon.md?rev=76781&view=auto

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8-neon.md (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8-neon.md Wed Jul 22 15:36:27 2009
@@ -0,0 +1,1308 @@
+;; APPLE LOCAL file v7 support. Merge from Codesourcery
+;; ARM Cortex-A8 NEON scheduling description.
+;; Copyright (C) 2007 Free Software Foundation, Inc.
+;; Contributed by CodeSourcery.
+
+;; This file is part of GCC.
+
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+(define_automaton "cortex_a8_neon")
+
+;; Only one load, store, permute, MCR or MRC instruction can be issued
+;; per cycle.
+(define_cpu_unit "cortex_a8_neon_issue_perm" "cortex_a8_neon")
+
+;; Only one data-processing instruction can be issued per cycle.
+(define_cpu_unit "cortex_a8_neon_issue_dp" "cortex_a8_neon")
+
+;; The VFPLite unit (non-pipelined).
+(define_cpu_unit "cortex_a8_vfplite" "cortex_a8_neon")
+
+;; We need a special mutual exclusion (to be used in addition to
+;; cortex_a8_neon_issue_dp) for the case when an instruction such as
+;; vmla.f is forwarded from E5 of the floating-point multiply pipeline to
+;; E2 of the floating-point add pipeline.  On the cycle previous to that
+;; forward we must prevent issue of any instruction to the floating-point
+;; add pipeline, but still allow issue of a data-processing instruction
+;; to any of the other pipelines.
+(define_cpu_unit "cortex_a8_neon_issue_fadd" "cortex_a8_neon")
+
+;; Patterns of reservation.
+;; We model the NEON issue units as running in parallel with the core ones.
+;; We assume that multi-cycle NEON instructions get decomposed into
+;; micro-ops as they are issued into the NEON pipeline, and not as they
+;; are issued into the ARM pipeline.  Dual issue may not occur except
+;; upon the first and last cycles of a multi-cycle instruction, but it
+;; is unclear whether two multi-cycle instructions can issue together (in
+;; this model they cannot).  It is also unclear whether a pair of
+;; a multi-cycle and single-cycle instructions, that could potentially
+;; issue together, only do so if (say) the single-cycle one precedes
+;; the other.
+
+(define_reservation "cortex_a8_neon_dp"
+                    "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp")
+(define_reservation "cortex_a8_neon_dp_2"
+                    "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp,\
+                     cortex_a8_neon_issue_dp")
+(define_reservation "cortex_a8_neon_dp_4"
+                    "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp,\
+                     cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_dp")
+
+(define_reservation "cortex_a8_neon_fadd"
+                    "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp+\
+                     cortex_a8_neon_issue_fadd")
+(define_reservation "cortex_a8_neon_fadd_2"
+                    "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp+\
+                     cortex_a8_neon_issue_fadd,\
+                     cortex_a8_neon_issue_dp+cortex_a8_neon_issue_fadd")
+
+(define_reservation "cortex_a8_neon_perm"
+                    "(cortex_a8_alu0|cortex_a8_alu1)+\
+                     cortex_a8_neon_issue_perm")
+(define_reservation "cortex_a8_neon_perm_2"
+                    "(cortex_a8_alu0|cortex_a8_alu1)+\
+                     cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_perm")
+(define_reservation "cortex_a8_neon_perm_3"
+                    "(cortex_a8_alu0|cortex_a8_alu1)+\
+                     cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_perm")
+
+(define_reservation "cortex_a8_neon_ls"
+                    "cortex_a8_issue_ls+cortex_a8_neon_issue_perm")
+(define_reservation "cortex_a8_neon_ls_2"
+                    "cortex_a8_issue_ls+cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_perm")
+(define_reservation "cortex_a8_neon_ls_3"
+                    "cortex_a8_issue_ls+cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_perm")
+(define_reservation "cortex_a8_neon_ls_4"
+                    "cortex_a8_issue_ls+cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_perm")
+(define_reservation "cortex_a8_neon_ls_5"
+                    "cortex_a8_issue_ls+cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+                     cortex_a8_neon_issue_perm")
+
+(define_reservation "cortex_a8_neon_fmul_then_fadd"
+                    "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp,\
+		     nothing*3,\
+		     cortex_a8_neon_issue_fadd")
+(define_reservation "cortex_a8_neon_fmul_then_fadd_2"
+                    "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp,\
+		     cortex_a8_neon_issue_dp,\
+		     nothing*2,\
+		     cortex_a8_neon_issue_fadd,\
+		     cortex_a8_neon_issue_fadd")
+
+;; VFP instructions can only be single-issued into the NEON pipeline.
+(define_reservation "cortex_a8_vfp"
+                    "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp+\
+                     cortex_a8_neon_issue_perm+cortex_a8_vfplite")
+
+;; VFP instructions.
+;; The VFPLite unit that executes these isn't pipelined; we give the
+;; worst-case latencies (and choose the double-precision ones where we
+;; do not distinguish on precision).  We assume RunFast mode is not
+;; enabled and therefore do not model the possible VFP instruction
+;; execution in the NEON floating point pipelines, nor additional
+;; latencies for the processing of subnormals.
+;;
+;; TODO: RunFast mode could potentially be enabled when -ffast-math
+;; is specified.
+
+(define_insn_reservation "cortex_a8_vfp_add_sub" 10
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "farith"))
+  "cortex_a8_vfp,cortex_a8_vfplite*9")
+
+(define_insn_reservation "cortex_a8_vfp_muls" 12
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "fmuls"))
+  "cortex_a8_vfp,cortex_a8_vfplite*11")
+
+(define_insn_reservation "cortex_a8_vfp_muld" 17
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "fmuld"))
+  "cortex_a8_vfp,cortex_a8_vfplite*16")
+
+(define_insn_reservation "cortex_a8_vfp_macs" 21
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "fmacs"))
+  "cortex_a8_vfp,cortex_a8_vfplite*20")
+
+(define_insn_reservation "cortex_a8_vfp_macd" 26
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "fmacd"))
+  "cortex_a8_vfp,cortex_a8_vfplite*25")
+
+(define_insn_reservation "cortex_a8_vfp_divs" 37
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "fdivs"))
+  "cortex_a8_vfp,cortex_a8_vfplite*36")
+
+(define_insn_reservation "cortex_a8_vfp_divd" 65
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "fdivd"))
+  "cortex_a8_vfp,cortex_a8_vfplite*64")
+
+;; Comparisons can sometimes take 7 cycles instead of 4, but since all
+;; the other instructions lumped into type=ffarith take 4 cycles, we
+;; pick that latency.
+(define_insn_reservation "cortex_a8_vfp_farith" 4
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "ffarith"))
+  "cortex_a8_vfp,cortex_a8_vfplite*3")
+
+(define_insn_reservation "cortex_a8_vfp_cvt" 7
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "f_cvt"))
+  "cortex_a8_vfp,cortex_a8_vfplite*6")
+
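Since VFPLite is modelled as non-pipelined, each reservation above keeps the unit busy for the instruction's full latency, so back-to-back VFP operations serialize even when they are independent. As a rough worked example under this model, a loop doing one double-precision division per iteration is charged about 65 cycles per iteration (the fdivd latency). A minimal C sketch, assuming the divisions compile to the VFP divide instruction (helper name illustrative):

/* Independent double-precision divides: in the model above, each one
   still occupies cortex_a8_vfplite for ~65 cycles, so the loop runs at
   roughly one iteration per 65 cycles.  */
static void reciprocal_all(double *x, int n)
{
  for (int i = 0; i < n; i++)
    x[i] = 1.0 / x[i];
}
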
+;; NEON -> core transfers.
+
+(define_insn_reservation "neon_mrc" 20
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mrc"))
+  "cortex_a8_neon_ls")
+
+(define_insn_reservation "neon_mrrc" 21
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mrrc"))
+  "cortex_a8_neon_ls_2")
+
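The 20- and 21-cycle latencies above are the cost this model assigns to moving NEON results back into ARM core registers. As a minimal C sketch (assuming arm_neon.h intrinsics; the helper name is illustrative), extracting lanes so a scalar result can be tested is the sort of operation that incurs such a transfer:

#include <arm_neon.h>

/* Folding a vector and extracting lanes forces NEON -> core transfers,
   which the reservations above charge at roughly 20 cycles.  */
static int any_nonzero(uint32x4_t v)
{
  uint32x2_t folded = vorr_u32(vget_low_u32(v), vget_high_u32(v));
  return (vget_lane_u32(folded, 0) | vget_lane_u32(folded, 1)) != 0;
}
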
+;; The remainder of this file is auto-generated by neon-schedgen.
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N3.
+(define_insn_reservation "neon_int_1" 3
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_int_1"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)m operands at N1,
+;; their (D|Q)n operands at N2, and produce a result at N3.
+(define_insn_reservation "neon_int_2" 3
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_int_2"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N3.
+(define_insn_reservation "neon_int_3" 3
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_int_3"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N4.
+(define_insn_reservation "neon_int_4" 4
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_int_4"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)m operands at N1,
+;; their (D|Q)n operands at N2, and produce a result at N4.
+(define_insn_reservation "neon_int_5" 4
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_int_5"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N4.
+(define_insn_reservation "neon_vqneg_vqabs" 4
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vqneg_vqabs"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation produce a result at N3.
+(define_insn_reservation "neon_vmov" 3
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vmov"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N6.
+(define_insn_reservation "neon_vaba" 6
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vaba"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N6 on cycle 2.
+(define_insn_reservation "neon_vaba_qqq" 7
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vaba_qqq"))
+  "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)m operands at N1,
+;; their (D|Q)d operands at N3, and produce a result at N6.
+(define_insn_reservation "neon_vsma" 6
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vsma"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N6.
+(define_insn_reservation "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long" 6
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N6 on cycle 2.
+(define_insn_reservation "neon_mul_qqq_8_16_32_ddd_32" 7
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mul_qqq_8_16_32_ddd_32"))
+  "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, and produce a result at N6 on cycle 2.
+(define_insn_reservation "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar" 7
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar"))
+  "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and
+;; produce a result at N6.
+(define_insn_reservation "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long" 6
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and
+;; produce a result at N6 on cycle 2.
+(define_insn_reservation "neon_mla_qqq_8_16" 7
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mla_qqq_8_16"))
+  "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N6 on cycle 2.
+(define_insn_reservation "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long" 7
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"))
+  "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N6 on cycle 4.
+(define_insn_reservation "neon_mla_qqq_32_qqd_32_scalar" 9
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mla_qqq_32_qqd_32_scalar"))
+  "cortex_a8_neon_dp_4")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, and produce a result at N6.
+(define_insn_reservation "neon_mul_ddd_16_scalar_32_16_long_scalar" 6
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mul_ddd_16_scalar_32_16_long_scalar"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, and produce a result at N6 on cycle 4.
+(define_insn_reservation "neon_mul_qqd_32_scalar" 9
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mul_qqd_32_scalar"))
+  "cortex_a8_neon_dp_4")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N6.
+(define_insn_reservation "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar" 6
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N3.
+(define_insn_reservation "neon_shift_1" 3
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_shift_1"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N4.
+(define_insn_reservation "neon_shift_2" 4
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_shift_2"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N3 on cycle 2.
+(define_insn_reservation "neon_shift_3" 4
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_shift_3"))
+  "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N1.
+(define_insn_reservation "neon_vshl_ddd" 1
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vshl_ddd"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N4 on cycle 2.
+(define_insn_reservation "neon_vqshl_vrshl_vqrshl_qqq" 5
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vqshl_vrshl_vqrshl_qqq"))
+  "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)m operands at N1,
+;; their (D|Q)d operands at N3, and produce a result at N6.
+(define_insn_reservation "neon_vsra_vrsra" 6
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vsra_vrsra"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N5.
+(define_insn_reservation "neon_fp_vadd_ddd_vabs_dd" 5
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_fp_vadd_ddd_vabs_dd"))
+  "cortex_a8_neon_fadd")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N5 on cycle 2.
+(define_insn_reservation "neon_fp_vadd_qqq_vabs_qq" 6
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_fp_vadd_qqq_vabs_qq"))
+  "cortex_a8_neon_fadd_2")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N5.
+(define_insn_reservation "neon_fp_vsum" 5
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_fp_vsum"))
+  "cortex_a8_neon_fadd")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, and produce a result at N5.
+(define_insn_reservation "neon_fp_vmul_ddd" 5
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_fp_vmul_ddd"))
+  "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, and produce a result at N5 on cycle 2.
+(define_insn_reservation "neon_fp_vmul_qqd" 6
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_fp_vmul_qqd"))
+  "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and
+;; produce a result at N9.
+(define_insn_reservation "neon_fp_vmla_ddd" 9
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_fp_vmla_ddd"))
+  "cortex_a8_neon_fmul_then_fadd")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and
+;; produce a result at N9 on cycle 2.
+(define_insn_reservation "neon_fp_vmla_qqq" 10
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_fp_vmla_qqq"))
+  "cortex_a8_neon_fmul_then_fadd_2")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N9.
+(define_insn_reservation "neon_fp_vmla_ddd_scalar" 9
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_fp_vmla_ddd_scalar"))
+  "cortex_a8_neon_fmul_then_fadd")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N9 on cycle 2.
+(define_insn_reservation "neon_fp_vmla_qqq_scalar" 10
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_fp_vmla_qqq_scalar"))
+  "cortex_a8_neon_fmul_then_fadd_2")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N9.
+(define_insn_reservation "neon_fp_vrecps_vrsqrts_ddd" 9
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_fp_vrecps_vrsqrts_ddd"))
+  "cortex_a8_neon_fmul_then_fadd")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N9 on cycle 2.
+(define_insn_reservation "neon_fp_vrecps_vrsqrts_qqq" 10
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_fp_vrecps_vrsqrts_qqq"))
+  "cortex_a8_neon_fmul_then_fadd_2")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N2.
+(define_insn_reservation "neon_bp_simple" 2
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_bp_simple"))
+  "cortex_a8_neon_perm")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N2 on cycle 2.
+(define_insn_reservation "neon_bp_2cycle" 3
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_bp_2cycle"))
+  "cortex_a8_neon_perm_2")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N2 on cycle 3.
+(define_insn_reservation "neon_bp_3cycle" 4
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_bp_3cycle"))
+  "cortex_a8_neon_perm_3")
+
+;; Instructions using this reservation produce a result at N1.
+(define_insn_reservation "neon_ldr" 1
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_ldr"))
+  "cortex_a8_neon_ls")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_str" 0
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_str"))
+  "cortex_a8_neon_ls")
+
+;; Instructions using this reservation produce a result at N1 on cycle 2.
+(define_insn_reservation "neon_vld1_1_2_regs" 2
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vld1_1_2_regs"))
+  "cortex_a8_neon_ls_2")
+
+;; Instructions using this reservation produce a result at N1 on cycle 3.
+(define_insn_reservation "neon_vld1_3_4_regs" 3
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vld1_3_4_regs"))
+  "cortex_a8_neon_ls_3")
+
+;; Instructions using this reservation produce a result at N2 on cycle 2.
+(define_insn_reservation "neon_vld2_2_regs_vld1_vld2_all_lanes" 3
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vld2_2_regs_vld1_vld2_all_lanes"))
+  "cortex_a8_neon_ls_2")
+
+;; Instructions using this reservation produce a result at N2 on cycle 3.
+(define_insn_reservation "neon_vld2_4_regs" 4
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vld2_4_regs"))
+  "cortex_a8_neon_ls_3")
+
+;; Instructions using this reservation produce a result at N2 on cycle 4.
+(define_insn_reservation "neon_vld3_vld4" 5
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vld3_vld4"))
+  "cortex_a8_neon_ls_4")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_vst1_1_2_regs_vst2_2_regs" 0
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs"))
+  "cortex_a8_neon_ls_2")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_vst1_3_4_regs" 0
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vst1_3_4_regs"))
+  "cortex_a8_neon_ls_3")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_vst2_4_regs_vst3_vst4" 0
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vst2_4_regs_vst3_vst4"))
+  "cortex_a8_neon_ls_4")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_vst3_vst4" 0
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vst3_vst4"))
+  "cortex_a8_neon_ls_4")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N2 on cycle 3.
+(define_insn_reservation "neon_vld1_vld2_lane" 4
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vld1_vld2_lane"))
+  "cortex_a8_neon_ls_3")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N2 on cycle 5.
+(define_insn_reservation "neon_vld3_vld4_lane" 6
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vld3_vld4_lane"))
+  "cortex_a8_neon_ls_5")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_vst1_vst2_lane" 0
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vst1_vst2_lane"))
+  "cortex_a8_neon_ls_2")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_vst3_vst4_lane" 0
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vst3_vst4_lane"))
+  "cortex_a8_neon_ls_3")
+
+;; Instructions using this reservation produce a result at N2 on cycle 2.
+(define_insn_reservation "neon_vld3_vld4_all_lanes" 3
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_vld3_vld4_all_lanes"))
+  "cortex_a8_neon_ls_3")
+
+;; Instructions using this reservation produce a result at N2.
+(define_insn_reservation "neon_mcr" 2
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mcr"))
+  "cortex_a8_neon_perm")
+
+;; Instructions using this reservation produce a result at N2.
+(define_insn_reservation "neon_mcr_2_mcrr" 2
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "neon_type" "neon_mcr_2_mcrr"))
+  "cortex_a8_neon_perm_2")
+
+;; Exceptions to the default latencies.
+
+(define_bypass 1 "neon_mcr_2_mcrr"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 1 "neon_mcr"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_vld3_vld4_all_lanes"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_vld3_vld4_lane"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_vld1_vld2_lane"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 4 "neon_vld3_vld4"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_vld2_4_regs"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_vld2_2_regs_vld1_vld2_all_lanes"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_vld1_3_4_regs"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 1 "neon_vld1_1_2_regs"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 0 "neon_ldr"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_bp_3cycle"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_bp_2cycle"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 1 "neon_bp_simple"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 9 "neon_fp_vrecps_vrsqrts_qqq"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 8 "neon_fp_vrecps_vrsqrts_ddd"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 9 "neon_fp_vmla_qqq_scalar"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 8 "neon_fp_vmla_ddd_scalar"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 9 "neon_fp_vmla_qqq"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 8 "neon_fp_vmla_ddd"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_fp_vmul_qqd"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 4 "neon_fp_vmul_ddd"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 4 "neon_fp_vsum"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_fp_vadd_qqq_vabs_qq"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 4 "neon_fp_vadd_ddd_vabs_dd"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_vsra_vrsra"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 4 "neon_vqshl_vrshl_vqrshl_qqq"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 0 "neon_vshl_ddd"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_shift_3"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_shift_2"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_shift_1"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 8 "neon_mul_qqd_32_scalar"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_mul_ddd_16_scalar_32_16_long_scalar"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 8 "neon_mla_qqq_32_qqd_32_scalar"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 6 "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 6 "neon_mla_qqq_8_16"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 6 "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 6 "neon_mul_qqq_8_16_32_ddd_32"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_vsma"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 6 "neon_vaba_qqq"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_vaba"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_vmov"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_vqneg_vqabs"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_int_5"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_int_4"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_int_3"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_int_2"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_int_1"
+               "neon_int_1,\
+               neon_int_4,\
+               neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mul_qqq_8_16_32_ddd_32,\
+               neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+               neon_mla_qqq_8_16,\
+               neon_fp_vadd_ddd_vabs_dd,\
+               neon_fp_vadd_qqq_vabs_qq,\
+               neon_fp_vmla_ddd,\
+               neon_fp_vmla_qqq,\
+               neon_fp_vrecps_vrsqrts_ddd,\
+               neon_fp_vrecps_vrsqrts_qqq")
+

Added: llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8.md?rev=76781&view=auto

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8.md (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8.md Wed Jul 22 15:36:27 2009
@@ -0,0 +1,273 @@
+;; APPLE LOCAL file v7 support. Merge from Codesourcery
+;; ARM Cortex-A8 scheduling description.
+;; Copyright (C) 2007 Free Software Foundation, Inc.
+;; Contributed by CodeSourcery.
+
+;; This file is part of GCC.
+
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+(define_automaton "cortex_a8")
+
+;; Only one load/store instruction can be issued per cycle
+;; (although reservation of this unit is only required for single
+;; loads and stores -- see below).
+(define_cpu_unit "cortex_a8_issue_ls" "cortex_a8")
+
+;; Only one branch instruction can be issued per cycle.
+(define_cpu_unit "cortex_a8_issue_branch" "cortex_a8")
+
+;; The two ALU pipelines.
+(define_cpu_unit "cortex_a8_alu0" "cortex_a8")
+(define_cpu_unit "cortex_a8_alu1" "cortex_a8")
+
+;; The usual flow of an instruction through the pipelines.
+(define_reservation "cortex_a8_default"
+                    "cortex_a8_alu0|cortex_a8_alu1")
+
+;; The flow of a branch instruction through the pipelines.
+(define_reservation "cortex_a8_branch"
+                    "(cortex_a8_alu0+cortex_a8_issue_branch)|\
+                     (cortex_a8_alu1+cortex_a8_issue_branch)")
+
+;; The flow of a load or store instruction through the pipeline in
+;; the case where that instruction consists of only one micro-op...
+(define_reservation "cortex_a8_load_store_1"
+                    "(cortex_a8_alu0+cortex_a8_issue_ls)|\
+                     (cortex_a8_alu1+cortex_a8_issue_ls)")
+
+;; ...and in the case of two micro-ops.  We don't need to reserve
+;; cortex_a8_issue_ls here because dual issue is altogether forbidden
+;; during the issue cycle of the first micro-op.  (Instead of modelling
+;; a separate issue unit, we reserve alu0 and alu1 to prevent any
+;; other instructions from being issued on that first cycle.)
+;; Even though the load/store pipeline is usually available in either
+;; ALU pipe, multi-cycle instructions always issue in pipeline 0.  This
+;; reservation is therefore the same as cortex_a8_multiply_2 below.
+(define_reservation "cortex_a8_load_store_2"
+                    "cortex_a8_alu0+cortex_a8_alu1,\
+                     cortex_a8_alu0")
+
+;; The flow of a single-cycle multiplication.
+(define_reservation "cortex_a8_multiply"
+                    "cortex_a8_alu0")
+
+;; The flow of a multiplication instruction that gets decomposed into
+;; two micro-ops.  The two micro-ops will be issued to pipeline 0 on
+;; successive cycles.  Dual issue cannot happen at the same time as the
+;; first of the micro-ops.
+(define_reservation "cortex_a8_multiply_2"
+                    "cortex_a8_alu0+cortex_a8_alu1,\
+                     cortex_a8_alu0")
+
+;; Similarly, the flow of a multiplication instruction that gets
+;; decomposed into three micro-ops.  Dual issue cannot occur except on
+;; the cycle upon which the third micro-op is issued.
+(define_reservation "cortex_a8_multiply_3"
+                    "cortex_a8_alu0+cortex_a8_alu1,\
+                     cortex_a8_alu0+cortex_a8_alu1,\
+                     cortex_a8_alu0")
+
+;; The model given here assumes that all instructions are unconditional.
+
+;; Data processing instructions, but not move instructions.
+
+;; We include CLZ with these since it has the same execution pattern
+;; (source read in E2 and destination available at the end of that cycle).
+(define_insn_reservation "cortex_a8_alu" 2
+  (and (eq_attr "tune" "cortexa8")
+       (ior (and (eq_attr "type" "alu")
+                (not (eq_attr "insn" "mov,mvn")))
+            (eq_attr "insn" "clz")))
+  "cortex_a8_default")
+
+(define_insn_reservation "cortex_a8_alu_shift" 2
+  (and (eq_attr "tune" "cortexa8")
+       (and (eq_attr "type" "alu_shift")
+            (not (eq_attr "insn" "mov,mvn"))))
+  "cortex_a8_default")
+
+(define_insn_reservation "cortex_a8_alu_shift_reg" 2
+  (and (eq_attr "tune" "cortexa8")
+       (and (eq_attr "type" "alu_shift_reg")
+            (not (eq_attr "insn" "mov,mvn"))))
+  "cortex_a8_default")
+
+;; Move instructions.
+
+(define_insn_reservation "cortex_a8_mov" 1
+  (and (eq_attr "tune" "cortexa8")
+       (and (eq_attr "type" "alu,alu_shift,alu_shift_reg")
+            (eq_attr "insn" "mov,mvn")))
+  "cortex_a8_default")
+
+;; Exceptions to the default latencies for data processing instructions.
+
+;; A move followed by an ALU instruction with no early dep.
+;; (Such a pair can be issued in parallel, hence latency zero.)
+(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu")
+(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu_shift"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu_shift_reg"
+               "arm_no_early_alu_shift_value_dep")
+
+;; An ALU instruction followed by an ALU instruction with no early dep.
+(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg"
+               "cortex_a8_alu")
+(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg"
+               "cortex_a8_alu_shift"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg"
+               "cortex_a8_alu_shift_reg"
+               "arm_no_early_alu_shift_value_dep")
+
+;; Multiplication instructions.  These are categorized according to their
+;; reservation behaviour and the need below to distinguish certain
+;; varieties for bypasses.  Results are available at the E5 stage
+;; (but some of these are multi-cycle instructions, which explains the
+;; latencies below).
+
+(define_insn_reservation "cortex_a8_mul" 6
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "insn" "mul,smulxy,smmul"))
+  "cortex_a8_multiply_2")
+
+(define_insn_reservation "cortex_a8_mla" 6
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "insn" "mla,smlaxy,smlawy,smmla,smlad,smlsd"))
+  "cortex_a8_multiply_2")
+
+(define_insn_reservation "cortex_a8_mull" 7
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "insn" "smull,umull,smlal,umlal,umaal,smlalxy"))
+  "cortex_a8_multiply_3")
+
+(define_insn_reservation "cortex_a8_smulwy" 5
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "insn" "smulwy,smuad,smusd"))
+  "cortex_a8_multiply")
+
+;; smlald and smlsld are multiply-accumulate instructions but do not
+;; receive bypassed data from other multiplication results; thus, they
+;; cannot go in cortex_a8_mla above.  (See below for bypass details.)
+(define_insn_reservation "cortex_a8_smlald" 6
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "insn" "smlald,smlsld"))
+  "cortex_a8_multiply_2")
+
+;; A multiply with a single-register result or an MLA, followed by an
+;; MLA with an accumulator dependency, has its result forwarded so two
+;; such instructions can issue back-to-back.
+(define_bypass 1 "cortex_a8_mul,cortex_a8_mla,cortex_a8_smulwy"
+               "cortex_a8_mla"
+               "arm_mac_accumulator_is_mul_result")
+
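As a minimal C sketch of the case this bypass covers (helper name illustrative), an integer multiply-accumulate reduction chains each MLA through its accumulator input, which is exactly the forwarded operand:

/* Each iteration's multiply-accumulate depends on the previous one only
   through 'acc', so with the 1-cycle accumulator bypass above the mla
   instructions can issue back-to-back.  */
static int dot_product(const int *a, const int *b, int n)
{
  int acc = 0;
  for (int i = 0; i < n; i++)
    acc += a[i] * b[i];
  return acc;
}
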
+;; A multiply followed by an ALU instruction needing the multiply
+;; result only at E2 has lower latency than one needing it at E1.
+(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\
+                  cortex_a8_smulwy,cortex_a8_smlald"
+               "cortex_a8_alu")
+(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\
+                  cortex_a8_smulwy,cortex_a8_smlald"
+               "cortex_a8_alu_shift"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\
+                  cortex_a8_smulwy,cortex_a8_smlald"
+               "cortex_a8_alu_shift_reg"
+               "arm_no_early_alu_shift_value_dep")
+
+;; Load instructions.
+;; The presence of any register writeback is ignored here.
+
+;; A load result has latency 3 unless the dependent instruction has
+;; no early dep, in which case it is only latency two.
+;; We assume 64-bit alignment for doubleword loads.
+(define_insn_reservation "cortex_a8_load1_2" 3
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "load1,load2,load_byte"))
+  "cortex_a8_load_store_1")
+
+(define_bypass 2 "cortex_a8_load1_2"
+               "cortex_a8_alu")
+(define_bypass 2 "cortex_a8_load1_2"
+               "cortex_a8_alu_shift"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 2 "cortex_a8_load1_2"
+               "cortex_a8_alu_shift_reg"
+               "arm_no_early_alu_shift_value_dep")
+
+;; We do not currently model the fact that loads with scaled register
+;; offsets that are not LSL #2 have an extra cycle latency (they issue
+;; as two micro-ops).
+
+;; A load multiple of three registers is usually issued as two micro-ops.
+;; The first register will be available at E3 of the first iteration,
+;; the second at E3 of the second iteration, and the third at E4 of
+;; the second iteration.  A load multiple of four registers is usually
+;; issued as two micro-ops.
+(define_insn_reservation "cortex_a8_load3_4" 5
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "load3,load4"))
+  "cortex_a8_load_store_2")
+
+(define_bypass 4 "cortex_a8_load3_4"
+               "cortex_a8_alu")
+(define_bypass 4 "cortex_a8_load3_4"
+               "cortex_a8_alu_shift"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 4 "cortex_a8_load3_4"
+               "cortex_a8_alu_shift_reg"
+               "arm_no_early_alu_shift_value_dep")
+
+;; Store instructions.
+;; Writeback is again ignored.
+
+(define_insn_reservation "cortex_a8_store1_2" 0
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "store1,store2"))
+  "cortex_a8_load_store_1")
+
+(define_insn_reservation "cortex_a8_store3_4" 0
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "store3,store4"))
+  "cortex_a8_load_store_2")
+
+;; An ALU instruction acting as a producer for a store instruction
+;; that only uses the result as the value to be stored (as opposed to
+;; using it to calculate the address) has latency zero; the store
+;; reads the value to be stored at the start of E3 and the ALU insn
+;; writes it at the end of E2.  Move instructions actually produce the
+;; result at the end of E1, but since we don't have delay slots, the
+;; scheduling behaviour will be the same.
+(define_bypass 0 "cortex_a8_alu,cortex_a8_alu_shift,\
+                  cortex_a8_alu_shift_reg,cortex_a8_mov"
+               "cortex_a8_store1_2,cortex_a8_store3_4"
+               "arm_no_early_store_addr_dep")
+
+;; Branch instructions
+
+(define_insn_reservation "cortex_a8_branch" 0
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "branch"))
+  "cortex_a8_branch")
+
+;; Call latencies are not predictable.  A semi-arbitrary very large
+;; number is used as "positive infinity" so that everything should be
+;; finished by the time of return.
+(define_insn_reservation "cortex_a8_call" 32
+  (and (eq_attr "tune" "cortexa8")
+       (eq_attr "type" "call"))
+  "cortex_a8_issue_branch")
+
+;; NEON (including VFP) instructions.
+
+(include "cortex-a8-neon.md")
+
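
For reference, the accumulator-forwarding bypass above (a MUL or MLA whose
result feeds the accumulator of a following MLA) is the pattern produced by
multiply-accumulate chains in source code.  A minimal C sketch, illustrative
only and not part of the patch; actual instruction selection depends on the
compiler and on options such as -mcpu=cortex-a8:

/* Illustrative only: a reduction like this is typically lowered to a MUL
   followed by dependent MLA instructions, which is the case the
   arm_mac_accumulator_is_mul_result bypass above is written for.  */
int dot3 (const int *a, const int *b)
{
  int acc = a[0] * b[0];   /* MUL */
  acc += a[1] * b[1];      /* MLA, accumulator dependency */
  acc += a[2] * b[2];      /* MLA, accumulator dependency */
  return acc;
}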

Added: llvm-gcc-4.2/trunk/gcc/config/arm/cortex-r4.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/cortex-r4.md?rev=76781&view=auto

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/cortex-r4.md (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/cortex-r4.md Wed Jul 22 15:36:27 2009
@@ -0,0 +1,289 @@
+;; APPLE LOCAL file v7 support. Merge from Codesourcery
+;; ARM Cortex-R4 scheduling description.
+;; Copyright (C) 2007 Free Software Foundation, Inc.
+;; Contributed by CodeSourcery.
+
+;; This file is part of GCC.
+
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+(define_automaton "cortex_r4")
+
+;; We approximate the dual-issue constraints of this core using four
+;; "issue units" and a reservation matrix as follows.  The numbers indicate
+;; the instruction groups' preferences in order.  Multiple entries for
+;; the same numbered preference indicate units that must be reserved
+;; together.
+;;
+;; Issue unit:		A	B	C	ALU
+;;
+;; ALU w/o reg shift	1st	2nd		1st and 2nd
+;; ALU w/ reg shift	1st	2nd	2nd	1st and 2nd
+;; Moves		1st	2nd		2nd
+;; Multiplication	1st			1st
+;; Division		1st			1st
+;; Load/store single	1st		1st
+;; Other load/store	1st	1st
+;; Branches			1st
+
+(define_cpu_unit "cortex_r4_issue_a" "cortex_r4")
+(define_cpu_unit "cortex_r4_issue_b" "cortex_r4")
+(define_cpu_unit "cortex_r4_issue_c" "cortex_r4")
+(define_cpu_unit "cortex_r4_issue_alu" "cortex_r4")
+
+(define_reservation "cortex_r4_alu"
+                    "(cortex_r4_issue_a+cortex_r4_issue_alu)|\
+                     (cortex_r4_issue_b+cortex_r4_issue_alu)")
+(define_reservation "cortex_r4_alu_shift_reg"
+                    "(cortex_r4_issue_a+cortex_r4_issue_alu)|\
+                     (cortex_r4_issue_b+cortex_r4_issue_c+\
+                      cortex_r4_issue_alu)")
+(define_reservation "cortex_r4_mov"
+                    "cortex_r4_issue_a|(cortex_r4_issue_b+\
+                     cortex_r4_issue_alu)")
+(define_reservation "cortex_r4_mul" "cortex_r4_issue_a+cortex_r4_issue_alu")
+(define_reservation "cortex_r4_mul_2"
+                    "(cortex_r4_issue_a+cortex_r4_issue_alu)*2")
+;; Division instructions execute out-of-order with respect to the
+;; rest of the pipeline and only require reservations on their first and
+;; final cycles.
+(define_reservation "cortex_r4_div_9"
+                    "cortex_r4_issue_a+cortex_r4_issue_alu,\
+                     nothing*7,\
+                     cortex_r4_issue_a+cortex_r4_issue_alu")
+(define_reservation "cortex_r4_div_10"
+                    "cortex_r4_issue_a+cortex_r4_issue_alu,\
+                     nothing*8,\
+                     cortex_r4_issue_a+cortex_r4_issue_alu")
+(define_reservation "cortex_r4_load_store"
+                    "cortex_r4_issue_a+cortex_r4_issue_c")
+(define_reservation "cortex_r4_load_store_2"
+                    "(cortex_r4_issue_a+cortex_r4_issue_b)*2")
+(define_reservation "cortex_r4_branch" "cortex_r4_issue_b")
+
+;; We assume that all instructions are unconditional.
+
+;; Data processing instructions.  Moves without shifts are kept separate
+;; for the purposes of the dual-issue constraints above.
+(define_insn_reservation "cortex_r4_alu" 2
+  (and (eq_attr "tune" "cortexr4")
+       (and (eq_attr "type" "alu")
+            (not (eq_attr "insn" "mov"))))
+  "cortex_r4_alu")
+
+(define_insn_reservation "cortex_r4_mov" 2
+  (and (eq_attr "tune" "cortexr4")
+       (and (eq_attr "type" "alu")
+            (eq_attr "insn" "mov")))
+  "cortex_r4_mov")
+
+(define_insn_reservation "cortex_r4_alu_shift" 2
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "type" "alu_shift"))
+  "cortex_r4_alu")
+
+(define_insn_reservation "cortex_r4_alu_shift_reg" 2
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "type" "alu_shift_reg"))
+  "cortex_r4_alu_shift_reg")
+
+;; An ALU instruction followed by an ALU instruction with no early dep.
+(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
+                  cortex_r4_mov"
+               "cortex_r4_alu")
+(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
+                  cortex_r4_mov"
+               "cortex_r4_alu_shift"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
+                  cortex_r4_mov"
+               "cortex_r4_alu_shift_reg"
+               "arm_no_early_alu_shift_value_dep")
+
+;; In terms of availabilities, a consumer mov could theoretically be
+;; issued together with a producer ALU instruction, without stalls.
+;; In practice this cannot happen because mov;add (in that order) is not
+;; eligible for dual issue and furthermore dual issue is not permitted
+;; when a dependency is involved.  We therefore note it as latency one.
+;; A mov followed by another of the same is also latency one.
+(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
+                  cortex_r4_mov"
+               "cortex_r4_mov")
+
+;; qadd, qdadd, qsub and qdsub are not currently emitted, and neither are
+;; media data processing instructions nor sad instructions.
+
+;; Multiplication instructions.
+
+(define_insn_reservation "cortex_r4_mul_4" 4
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "insn" "mul,smmul"))
+  "cortex_r4_mul_2")
+
+(define_insn_reservation "cortex_r4_mul_3" 3
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "insn" "smulxy,smulwy,smuad,smusd"))
+  "cortex_r4_mul")
+
+(define_insn_reservation "cortex_r4_mla_4" 4
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "insn" "mla,smmla,smmls"))
+  "cortex_r4_mul_2")
+
+(define_insn_reservation "cortex_r4_mla_3" 3
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "insn" "smlaxy,smlawy,smlad,smlsd"))
+  "cortex_r4_mul")
+
+(define_insn_reservation "cortex_r4_smlald" 3
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "insn" "smlald,smlsld"))
+  "cortex_r4_mul")
+
+(define_insn_reservation "cortex_r4_mull" 4
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "insn" "smull,umull,umlal,umaal"))
+  "cortex_r4_mul_2")
+
+;; A multiply or an MLA with a single-register result, followed by an
+;; MLA with an accumulator dependency, has its result forwarded.
+(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3"
+               "cortex_r4_mla_3,cortex_r4_mla_4"
+               "arm_mac_accumulator_is_mul_result")
+
+(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4"
+               "cortex_r4_mla_3,cortex_r4_mla_4"
+               "arm_mac_accumulator_is_mul_result")
+
+;; A multiply followed by an ALU instruction needing the multiply
+;; result only at ALU has lower latency than one needing it at Shift.
+(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
+               "cortex_r4_alu")
+(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
+               "cortex_r4_alu_shift"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
+               "cortex_r4_alu_shift_reg"
+               "arm_no_early_alu_shift_value_dep")
+(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
+               "cortex_r4_alu")
+(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
+               "cortex_r4_alu_shift"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
+               "cortex_r4_alu_shift_reg"
+               "arm_no_early_alu_shift_value_dep")
+
+;; A multiply followed by a mov has one cycle lower latency again.
+(define_bypass 1 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
+               "cortex_r4_mov")
+(define_bypass 2 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
+               "cortex_r4_mov")
+
+;; We guess that division of A/B using sdiv or udiv, on average, 
+;; is performed with B having ten more leading zeros than A.
+;; This gives a latency of nine for udiv and ten for sdiv.
+(define_insn_reservation "cortex_r4_udiv" 9
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "insn" "udiv"))
+  "cortex_r4_div_9")
+
+(define_insn_reservation "cortex_r4_sdiv" 10
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "insn" "sdiv"))
+  "cortex_r4_div_10")
+
+;; Branches.  We assume correct prediction.
+
+(define_insn_reservation "cortex_r4_branch" 0
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "type" "branch"))
+  "cortex_r4_branch")
+
+;; Call latencies are not predictable.  A semi-arbitrary very large
+;; number is used as "positive infinity" so that everything should be
+;; finished by the time of return.
+(define_insn_reservation "cortex_r4_call" 32
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "type" "call"))
+  "nothing")
+
+;; Status register access instructions are not currently emitted.
+
+;; Load instructions.
+;; We do not model the "addr_md_3cycle" cases and assume that
+;; accesses following are correctly aligned.
+
+(define_insn_reservation "cortex_r4_load_1_2" 3
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "type" "load1,load2"))
+  "cortex_r4_load_store")
+
+(define_insn_reservation "cortex_r4_load_3_4" 4
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "type" "load3,load4"))
+  "cortex_r4_load_store_2")
+
+;; If a producing load is followed by an instruction consuming only
+;; as a Normal Reg, there is one fewer cycle of latency.
+
+(define_bypass 2 "cortex_r4_load_1_2"
+               "cortex_r4_alu")
+(define_bypass 2 "cortex_r4_load_1_2"
+               "cortex_r4_alu_shift"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 2 "cortex_r4_load_1_2"
+               "cortex_r4_alu_shift_reg"
+               "arm_no_early_alu_shift_value_dep")
+
+(define_bypass 3 "cortex_r4_load_3_4"
+               "cortex_r4_alu")
+(define_bypass 3 "cortex_r4_load_3_4"
+               "cortex_r4_alu_shift"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 3 "cortex_r4_load_3_4"
+               "cortex_r4_alu_shift_reg"
+               "arm_no_early_alu_shift_value_dep")
+
+;; If a producing load is followed by an instruction consuming only
+;; as a Late Reg, there are two fewer cycles of latency.  Such consumer
+;; instructions are moves and stores.
+
+(define_bypass 1 "cortex_r4_load_1_2"
+               "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4")
+(define_bypass 2 "cortex_r4_load_3_4"
+               "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4")
+
+;; If a producer's result is required as the base or offset of a load,
+;; there is an extra cycle latency.
+
+(define_bypass 3 "cortex_r4_alu,cortex_r4_mov,cortex_r4_alu_shift,\
+                  cortex_r4_alu_shift_reg"
+               "cortex_r4_load_1_2,cortex_r4_load_3_4")
+
+(define_bypass 4 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
+               "cortex_r4_load_1_2,cortex_r4_load_3_4")
+
+(define_bypass 5 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
+               "cortex_r4_load_1_2,cortex_r4_load_3_4")
+
+;; Store instructions.
+
+(define_insn_reservation "cortex_r4_store_1_2" 0
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "type" "store1,store2"))
+  "cortex_r4_load_store")
+
+(define_insn_reservation "cortex_r4_store_3_4" 0
+  (and (eq_attr "tune" "cortexr4")
+       (eq_attr "type" "store3,store4"))
+  "cortex_r4_load_store_2")
+

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/darwin.h
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/darwin.h?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/darwin.h (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/darwin.h Wed Jul 22 15:36:27 2009
@@ -52,7 +52,9 @@
 
 #define REGISTER_PREFIX 	""
 
-/* The assembler's names for the registers.  */
+/* The assembler's names for the registers.  Note that the ?xx registers are
+ * there so that VFPv3/NEON registers D16-D31 have the same spacing as D0-D15
+ * (each of which is overlaid on two S registers), although there are no
+ * actual single-precision registers which correspond to D16-D31.  */
 #ifndef REGISTER_NAMES
 #define REGISTER_NAMES				   \
 {				                   \
@@ -73,6 +75,10 @@
   "s8",  "s9",  "s10", "s11", "s12", "s13", "s14", "s15", \
   "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", \
   "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31", \
+  "d16", "?16", "d17", "?17", "d18", "?18", "d19", "?19", \
+  "d20", "?20", "d21", "?21", "d22", "?22", "d23", "?23", \
+  "d24", "?24", "d25", "?25", "d26", "?26", "d27", "?27", \
+  "d28", "?28", "d29", "?29", "d30", "?30", "d31", "?31", \
   "vfpcc"					   \
 }
 #endif
@@ -162,22 +168,30 @@
   {"mvdx13", 40},				\
   {"mvdx14", 41},				\
   {"mvdx15", 42},				\
-  {"d0", 63},					\
-  {"d1", 65},					\
-  {"d2", 67},					\
-  {"d3", 69},					\
-  {"d4", 71},					\
-  {"d5", 73},					\
-  {"d6", 75},					\
-  {"d7", 77},					\
-  {"d8", 79},					\
-  {"d9", 81},					\
-  {"d10", 83},					\
-  {"d11", 85},					\
-  {"d12", 87},					\
-  {"d13", 89},					\
-  {"d14", 91},					\
-  {"d15", 93},					\
+  {"d0", 63}, {"q0", 63},                       \
+  {"d1", 65},                                   \
+  {"d2", 67}, {"q1", 67},                       \
+  {"d3", 69},                                   \
+  {"d4", 71}, {"q2", 71},                       \
+  {"d5", 73},                                   \
+  {"d6", 75}, {"q3", 75},                       \
+  {"d7", 77},                                   \
+  {"d8", 79}, {"q4", 79},                       \
+  {"d9", 81},                                   \
+  {"d10", 83}, {"q5", 83},                      \
+  {"d11", 85},                                  \
+  {"d12", 87}, {"q6", 87},                      \
+  {"d13", 89},                                  \
+  {"d14", 91}, {"q7", 91},                      \
+  {"d15", 93},                                  \
+  {"q8", 95},                                   \
+  {"q9", 99},                                   \
+  {"q10", 103},                                 \
+  {"q11", 107},                                 \
+  {"q12", 111},                                 \
+  {"q13", 115},                                 \
+  {"q14", 119},                                 \
+  {"q15", 123}                                  \
 }
 #endif
 
@@ -200,6 +214,13 @@
    march=armv5tej:armv5;			\
    march=xscale:xscale;				\
    march=armv4t:armv4t;				\
+   march=armv7:armv7;                           \
+   march=armv7-a:armv7;                         \
+   march=armv7-r:armv7;                         \
+   march=armv7-m:armv7;                         \
+   march=armv7a:armv7;                          \
+   march=armv7r:armv7;                          \
+   march=armv7m:armv7;                          \
    mcpu=arm10tdmi:armv5;			\
    mcpu=arm1020t:armv5;				\
    mcpu=arm9e:armv5;				\
@@ -216,9 +237,12 @@
    mcpu=arm1136jf-s:armv6;			\
    mcpu=arm1176jz-s:armv6;			\
    mcpu=arm1176jzf-s:armv6;			\
+   mcpu=cortex-a8:armv7;			\
+   mcpu=cortex-r4:armv7;			\
+   mcpu=cortex-m3:armv7;			\
    :arm -force_cpusubtype_ALL}"
 
-#define DARWIN_MINVERSION_SPEC "2.0"
+#define DARWIN_MINVERSION_SPEC "3.0"
 
 /* Default cc1 option for specifying minimum version number.  */
 #define DARWIN_CC1_MINVERSION_SPEC "-miphoneos-version-min=%(darwin_minversion)"
@@ -235,7 +259,7 @@
 #define SUBTARGET_EXTRA_SPECS			\
   DARWIN_EXTRA_SPECS				\
   { "darwin_arch", DARWIN_SUBARCH_SPEC },	\
-  { "darwin_subarch", DARWIN_SUBARCH_SPEC },
+  { "darwin_subarch", DARWIN_SUBARCH_SPEC }
 
 /* This can go away once we can feature test the assembler correctly.  */
 #define ASM_DEBUG_SPEC ""
@@ -245,7 +269,7 @@
   if (1)								\
   {									\
     if (!darwin_macosx_version_min && !darwin_iphoneos_version_min)	\
-      darwin_iphoneos_version_min = "2.0";				\
+      darwin_iphoneos_version_min = "3.0";				\
     if (MACHO_DYNAMIC_NO_PIC_P)						\
       {									\
         if (flag_pic)							\
@@ -265,23 +289,18 @@
     /* Use -mlongcalls for kexts */					\
     if (flag_mkernel || flag_apple_kext)				\
       target_flags |= MASK_LONG_CALLS;					\
+    /* GCC 4.2+ only works with SDK 3.0+ */				\
+    if (darwin_iphoneos_version_min &&					\
+        strverscmp (darwin_iphoneos_version_min, "3.0") < 0)		\
+      darwin_reserve_r9_on_v6 = 1;					\
   }									\
 } while(0)
 
-/* We reserve r9 on darwin for thread local data.  */
+/* APPLE LOCAL begin 5571707 Allow R9 as caller-saved register */
 #undef SUBTARGET_CONDITIONAL_REGISTER_USAGE
 #define SUBTARGET_CONDITIONAL_REGISTER_USAGE			\
-  if (1)							\
-    {								\
-      fixed_regs[9]     = 1;					\
-      call_used_regs[9] = 1;					\
-    }								\
-  if (TARGET_THUMB)						\
-    {								\
-      fixed_regs[THUMB_HARD_FRAME_POINTER_REGNUM] = 1;		\
-      call_used_regs[THUMB_HARD_FRAME_POINTER_REGNUM] = 1;	\
-      global_regs[THUMB_HARD_FRAME_POINTER_REGNUM] = 1;		\
-    }
+  arm_darwin_subtarget_conditional_register_usage();
+/* APPLE LOCAL end 5571707 Allow R9 as caller-saved register */
 
 #undef TARGET_MACHO
 #define TARGET_MACHO 1
@@ -308,12 +327,13 @@
 #undef SUBTARGET_ASM_DECLARE_FUNCTION_NAME
 #define SUBTARGET_ASM_DECLARE_FUNCTION_NAME ARM_DECLARE_FUNCTION_NAME
 
-/* We default to VFP */
-#define FPUTYPE_DEFAULT FPUTYPE_VFP
+/* APPLE LOCAL begin 6093388 -mfpu=neon default for v7a */
+/* We default to VFP for v6, NEON for v7 */
+#define FPUTYPE_DEFAULT (arm_arch7a ? FPUTYPE_NEON : FPUTYPE_VFP)
 
 #undef TARGET_DEFAULT_FLOAT_ABI
-#define TARGET_DEFAULT_FLOAT_ABI (arm_arch6 ? ARM_FLOAT_ABI_SOFTFP : ARM_FLOAT_ABI_SOFT)
-
+#define TARGET_DEFAULT_FLOAT_ABI ((arm_arch6 || arm_arch7a) ? ARM_FLOAT_ABI_SOFTFP : ARM_FLOAT_ABI_SOFT)
+/* APPLE LOCAL end 6093388 -mfpu=neon default for v7a */
 #undef REGISTER_TARGET_PRAGMAS
 #define REGISTER_TARGET_PRAGMAS DARWIN_REGISTER_TARGET_PRAGMAS
 

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/elf.h
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/elf.h?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/elf.h (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/elf.h Wed Jul 22 15:36:27 2009
@@ -96,8 +96,11 @@
 /* Define this macro if jump tables (for `tablejump' insns) should be
    output in the text section, along with the assembler instructions.
    Otherwise, the readonly data section is used.  */
-/* We put ARM jump tables in the text section, because it makes the code
-   more efficient, but for Thumb it's better to put them out of band.  */
+/* APPLE LOCAL begin v7 support. Merge from Codesourcery */
+/* We put ARM and Thumb-2 jump tables in the text section, because it makes
+   the code more efficient, but for Thumb-1 it's better to put them out of
+   band.  */
+/* APPLE LOCAL end v7 support. Merge from Codesourcery */
 /* APPLE LOCAL begin ARM compact switch tables */
 /* The above is no longer true.  */
 #define JUMP_TABLES_IN_TEXT_SECTION (TARGET_EITHER)
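
The jump tables that JUMP_TABLES_IN_TEXT_SECTION refers to are the ones
generated for dense switch statements.  A minimal C sketch, illustrative
only and not part of the patch:

/* Illustrative only: a dense switch like this can be compiled to a
   tablejump, and the macro above controls whether the resulting jump
   table goes in the text section or in the read-only data section.  */
int classify (int x)
{
  switch (x)
    {
    case 0: return 10;
    case 1: return 11;
    case 2: return 13;
    case 3: return 17;
    case 4: return 19;
    default: return -1;
    }
}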

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/fpa.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/fpa.md?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/fpa.md (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/fpa.md Wed Jul 22 15:36:27 2009
@@ -22,6 +22,12 @@
 ;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
 ;; Boston, MA 02110-1301, USA.
 
+;; APPLE LOCAL begin v7 support. Merge from mainline
+;; Some FPA mnemonics are ambiguous between conditional infixes and
+;; conditional suffixes.  All instructions use a conditional infix,
+;; even in unified assembly mode.
+
+;; APPLE LOCAL end v7 support. Merge from mainline
 ;; FPA automaton.
 (define_automaton "armfp")
 
@@ -101,7 +107,8 @@
   [(set (match_operand:SF          0 "s_register_operand" "=f,f")
 	(plus:SF (match_operand:SF 1 "s_register_operand" "%f,f")
 		 (match_operand:SF 2 "arm_float_add_operand"    "fG,H")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    adf%?s\\t%0, %1, %2
    suf%?s\\t%0, %1, #%N2"
@@ -113,7 +120,8 @@
   [(set (match_operand:DF          0 "s_register_operand" "=f,f")
 	(plus:DF (match_operand:DF 1 "s_register_operand" "%f,f")
 		 (match_operand:DF 2 "arm_float_add_operand"    "fG,H")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    adf%?d\\t%0, %1, %2
    suf%?d\\t%0, %1, #%N2"
@@ -126,7 +134,8 @@
 	(plus:DF (float_extend:DF
 		  (match_operand:SF 1 "s_register_operand"  "f,f"))
 		 (match_operand:DF  2 "arm_float_add_operand"    "fG,H")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    adf%?d\\t%0, %1, %2
    suf%?d\\t%0, %1, #%N2"
@@ -139,7 +148,8 @@
 	(plus:DF (match_operand:DF  1 "s_register_operand"  "f")
 		 (float_extend:DF
 		  (match_operand:SF 2 "s_register_operand"  "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "adf%?d\\t%0, %1, %2"
   [(set_attr "type" "farith")
    (set_attr "predicable" "yes")]
@@ -151,7 +161,8 @@
 		  (match_operand:SF 1 "s_register_operand" "f"))
 		 (float_extend:DF
 		  (match_operand:SF 2 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "adf%?d\\t%0, %1, %2"
   [(set_attr "type" "farith")
    (set_attr "predicable" "yes")]
@@ -161,7 +172,8 @@
   [(set (match_operand:SF 0 "s_register_operand" "=f,f")
 	(minus:SF (match_operand:SF 1 "arm_float_rhs_operand" "f,G")
 		  (match_operand:SF 2 "arm_float_rhs_operand" "fG,f")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    suf%?s\\t%0, %1, %2
    rsf%?s\\t%0, %2, %1"
@@ -172,7 +184,8 @@
   [(set (match_operand:DF           0 "s_register_operand" "=f,f")
 	(minus:DF (match_operand:DF 1 "arm_float_rhs_operand"     "f,G")
 		  (match_operand:DF 2 "arm_float_rhs_operand"    "fG,f")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    suf%?d\\t%0, %1, %2
    rsf%?d\\t%0, %2, %1"
@@ -185,7 +198,8 @@
 	(minus:DF (float_extend:DF
 		   (match_operand:SF 1 "s_register_operand"  "f"))
 		  (match_operand:DF  2 "arm_float_rhs_operand"    "fG")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "suf%?d\\t%0, %1, %2"
   [(set_attr "type" "farith")
    (set_attr "predicable" "yes")]
@@ -196,7 +210,8 @@
 	(minus:DF (match_operand:DF 1 "arm_float_rhs_operand" "f,G")
 		  (float_extend:DF
 		   (match_operand:SF 2 "s_register_operand" "f,f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    suf%?d\\t%0, %1, %2
    rsf%?d\\t%0, %2, %1"
@@ -210,7 +225,8 @@
 		   (match_operand:SF 1 "s_register_operand" "f"))
 		  (float_extend:DF
 		   (match_operand:SF 2 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "suf%?d\\t%0, %1, %2"
   [(set_attr "type" "farith")
    (set_attr "predicable" "yes")]
@@ -220,7 +236,8 @@
   [(set (match_operand:SF 0 "s_register_operand" "=f")
 	(mult:SF (match_operand:SF 1 "s_register_operand" "f")
 		 (match_operand:SF 2 "arm_float_rhs_operand" "fG")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "fml%?s\\t%0, %1, %2"
   [(set_attr "type" "ffmul")
    (set_attr "predicable" "yes")]
@@ -230,7 +247,8 @@
   [(set (match_operand:DF 0 "s_register_operand" "=f")
 	(mult:DF (match_operand:DF 1 "s_register_operand" "f")
 		 (match_operand:DF 2 "arm_float_rhs_operand" "fG")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "muf%?d\\t%0, %1, %2"
   [(set_attr "type" "fmul")
    (set_attr "predicable" "yes")]
@@ -241,7 +259,8 @@
 	(mult:DF (float_extend:DF
 		  (match_operand:SF 1 "s_register_operand" "f"))
 		 (match_operand:DF 2 "arm_float_rhs_operand" "fG")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "muf%?d\\t%0, %1, %2"
   [(set_attr "type" "fmul")
    (set_attr "predicable" "yes")]
@@ -252,7 +271,8 @@
 	(mult:DF (match_operand:DF 1 "s_register_operand" "f")
 		 (float_extend:DF
 		  (match_operand:SF 2 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "muf%?d\\t%0, %1, %2"
   [(set_attr "type" "fmul")
    (set_attr "predicable" "yes")]
@@ -263,7 +283,8 @@
 	(mult:DF
 	 (float_extend:DF (match_operand:SF 1 "s_register_operand" "f"))
 	 (float_extend:DF (match_operand:SF 2 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "muf%?d\\t%0, %1, %2"
   [(set_attr "type" "fmul")
    (set_attr "predicable" "yes")]
@@ -275,7 +296,8 @@
   [(set (match_operand:SF 0 "s_register_operand" "=f,f")
 	(div:SF (match_operand:SF 1 "arm_float_rhs_operand" "f,G")
 		(match_operand:SF 2 "arm_float_rhs_operand" "fG,f")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    fdv%?s\\t%0, %1, %2
    frd%?s\\t%0, %2, %1"
@@ -287,7 +309,8 @@
   [(set (match_operand:DF 0 "s_register_operand" "=f,f")
 	(div:DF (match_operand:DF 1 "arm_float_rhs_operand" "f,G")
 		(match_operand:DF 2 "arm_float_rhs_operand" "fG,f")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    dvf%?d\\t%0, %1, %2
    rdf%?d\\t%0, %2, %1"
@@ -300,7 +323,8 @@
 	(div:DF (float_extend:DF
 		 (match_operand:SF 1 "s_register_operand" "f"))
 		(match_operand:DF 2 "arm_float_rhs_operand" "fG")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "dvf%?d\\t%0, %1, %2"
   [(set_attr "type" "fdivd")
    (set_attr "predicable" "yes")]
@@ -311,7 +335,8 @@
 	(div:DF (match_operand:DF 1 "arm_float_rhs_operand" "fG")
 		(float_extend:DF
 		 (match_operand:SF 2 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "rdf%?d\\t%0, %2, %1"
   [(set_attr "type" "fdivd")
    (set_attr "predicable" "yes")]
@@ -323,7 +348,8 @@
 		 (match_operand:SF 1 "s_register_operand" "f"))
 		(float_extend:DF
 		 (match_operand:SF 2 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "dvf%?d\\t%0, %1, %2"
   [(set_attr "type" "fdivd")
    (set_attr "predicable" "yes")]
@@ -333,7 +359,8 @@
   [(set (match_operand:SF 0 "s_register_operand" "=f")
 	(mod:SF (match_operand:SF 1 "s_register_operand" "f")
 		(match_operand:SF 2 "arm_float_rhs_operand" "fG")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "rmf%?s\\t%0, %1, %2"
   [(set_attr "type" "fdivs")
    (set_attr "predicable" "yes")]
@@ -343,7 +370,8 @@
   [(set (match_operand:DF 0 "s_register_operand" "=f")
 	(mod:DF (match_operand:DF 1 "s_register_operand" "f")
 		(match_operand:DF 2 "arm_float_rhs_operand" "fG")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "rmf%?d\\t%0, %1, %2"
   [(set_attr "type" "fdivd")
    (set_attr "predicable" "yes")]
@@ -354,7 +382,8 @@
 	(mod:DF (float_extend:DF
 		 (match_operand:SF 1 "s_register_operand" "f"))
 		(match_operand:DF 2 "arm_float_rhs_operand" "fG")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "rmf%?d\\t%0, %1, %2"
   [(set_attr "type" "fdivd")
    (set_attr "predicable" "yes")]
@@ -365,7 +394,8 @@
 	(mod:DF (match_operand:DF 1 "s_register_operand" "f")
 		(float_extend:DF
 		 (match_operand:SF 2 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "rmf%?d\\t%0, %1, %2"
   [(set_attr "type" "fdivd")
    (set_attr "predicable" "yes")]
@@ -377,7 +407,8 @@
 		 (match_operand:SF 1 "s_register_operand" "f"))
 		(float_extend:DF
 		 (match_operand:SF 2 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "rmf%?d\\t%0, %1, %2"
   [(set_attr "type" "fdivd")
    (set_attr "predicable" "yes")]
@@ -386,7 +417,8 @@
 (define_insn "*negsf2_fpa"
   [(set (match_operand:SF         0 "s_register_operand" "=f")
 	(neg:SF (match_operand:SF 1 "s_register_operand" "f")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "mnf%?s\\t%0, %1"
   [(set_attr "type" "ffarith")
    (set_attr "predicable" "yes")]
@@ -395,7 +427,8 @@
 (define_insn "*negdf2_fpa"
   [(set (match_operand:DF         0 "s_register_operand" "=f")
 	(neg:DF (match_operand:DF 1 "s_register_operand" "f")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "mnf%?d\\t%0, %1"
   [(set_attr "type" "ffarith")
    (set_attr "predicable" "yes")]
@@ -405,7 +438,8 @@
   [(set (match_operand:DF 0 "s_register_operand" "=f")
 	(neg:DF (float_extend:DF
 		 (match_operand:SF 1 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "mnf%?d\\t%0, %1"
   [(set_attr "type" "ffarith")
    (set_attr "predicable" "yes")]
@@ -414,7 +448,8 @@
 (define_insn "*abssf2_fpa"
   [(set (match_operand:SF          0 "s_register_operand" "=f")
 	 (abs:SF (match_operand:SF 1 "s_register_operand" "f")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "abs%?s\\t%0, %1"
   [(set_attr "type" "ffarith")
    (set_attr "predicable" "yes")]
@@ -423,7 +458,8 @@
 (define_insn "*absdf2_fpa"
   [(set (match_operand:DF         0 "s_register_operand" "=f")
 	(abs:DF (match_operand:DF 1 "s_register_operand" "f")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "abs%?d\\t%0, %1"
   [(set_attr "type" "ffarith")
    (set_attr "predicable" "yes")]
@@ -433,7 +469,8 @@
   [(set (match_operand:DF 0 "s_register_operand" "=f")
 	(abs:DF (float_extend:DF
 		 (match_operand:SF 1 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "abs%?d\\t%0, %1"
   [(set_attr "type" "ffarith")
    (set_attr "predicable" "yes")]
@@ -442,7 +479,8 @@
 (define_insn "*sqrtsf2_fpa"
   [(set (match_operand:SF 0 "s_register_operand" "=f")
 	(sqrt:SF (match_operand:SF 1 "s_register_operand" "f")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "sqt%?s\\t%0, %1"
   [(set_attr "type" "float_em")
    (set_attr "predicable" "yes")]
@@ -451,7 +489,8 @@
 (define_insn "*sqrtdf2_fpa"
   [(set (match_operand:DF 0 "s_register_operand" "=f")
 	(sqrt:DF (match_operand:DF 1 "s_register_operand" "f")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "sqt%?d\\t%0, %1"
   [(set_attr "type" "float_em")
    (set_attr "predicable" "yes")]
@@ -461,7 +500,8 @@
   [(set (match_operand:DF 0 "s_register_operand" "=f")
 	(sqrt:DF (float_extend:DF
 		  (match_operand:SF 1 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "sqt%?d\\t%0, %1"
   [(set_attr "type" "float_em")
    (set_attr "predicable" "yes")]
@@ -470,7 +510,8 @@
 (define_insn "*floatsisf2_fpa"
   [(set (match_operand:SF           0 "s_register_operand" "=f")
 	(float:SF (match_operand:SI 1 "s_register_operand" "r")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "flt%?s\\t%0, %1"
   [(set_attr "type" "r_2_f")
    (set_attr "predicable" "yes")]
@@ -479,7 +520,8 @@
 (define_insn "*floatsidf2_fpa"
   [(set (match_operand:DF           0 "s_register_operand" "=f")
 	(float:DF (match_operand:SI 1 "s_register_operand" "r")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "flt%?d\\t%0, %1"
   [(set_attr "type" "r_2_f")
    (set_attr "predicable" "yes")]
@@ -488,7 +530,8 @@
 (define_insn "*fix_truncsfsi2_fpa"
   [(set (match_operand:SI         0 "s_register_operand" "=r")
 	(fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "fix%?z\\t%0, %1"
   [(set_attr "type" "f_2_r")
    (set_attr "predicable" "yes")]
@@ -497,7 +540,8 @@
 (define_insn "*fix_truncdfsi2_fpa"
   [(set (match_operand:SI         0 "s_register_operand" "=r")
 	(fix:SI (fix:DF (match_operand:DF 1 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "fix%?z\\t%0, %1"
   [(set_attr "type" "f_2_r")
    (set_attr "predicable" "yes")]
@@ -507,7 +551,8 @@
   [(set (match_operand:SF 0 "s_register_operand" "=f")
 	(float_truncate:SF
 	 (match_operand:DF 1 "s_register_operand" "f")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "mvf%?s\\t%0, %1"
   [(set_attr "type" "ffarith")
    (set_attr "predicable" "yes")]
@@ -516,7 +561,8 @@
 (define_insn "*extendsfdf2_fpa"
   [(set (match_operand:DF                  0 "s_register_operand" "=f")
 	(float_extend:DF (match_operand:SF 1 "s_register_operand"  "f")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "mvf%?d\\t%0, %1"
   [(set_attr "type" "ffarith")
    (set_attr "predicable" "yes")]
@@ -556,13 +602,14 @@
    && TARGET_HARD_FLOAT && TARGET_FPA
    && (GET_CODE (operands[0]) != MEM
        || register_operand (operands[1], DFmode))"
+;; APPLE LOCAL begin v7 support. Merge from mainline
   "*
   {
   switch (which_alternative)
     {
     default:
-    case 0: return \"ldm%?ia\\t%m1, %M0\\t%@ double\";
-    case 1: return \"stm%?ia\\t%m0, %M1\\t%@ double\";
+    case 0: return \"ldm%(ia%)\\t%m1, %M0\\t%@ double\";
+    case 1: return \"stm%(ia%)\\t%m0, %M1\\t%@ double\";
     case 2: return \"#\";
     case 3: case 4: return output_move_double (operands);
     case 5: return \"mvf%?d\\t%0, %1\";
@@ -574,6 +621,7 @@
     }
   }
   "
+;; APPLE LOCAL end v7 support. Merge from mainline
   [(set_attr "length" "4,4,8,8,8,4,4,4,4,8,8")
    (set_attr "predicable" "yes")
    (set_attr "type"
@@ -609,11 +657,105 @@
    (set_attr "type" "ffarith,f_load,f_store")]
 )
 
+;; APPLE LOCAL begin v7 support. Merge from mainline
+;; stfs/ldfs always use a conditional infix.  This works around the
+;; ambiguity between "stf pl s" and "stfp ls".
+(define_insn "*thumb2_movsf_fpa"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=f,f,f, m,f,r,r,r, m")
+	(match_operand:SF 1 "general_operand"      "fG,H,mE,f,r,f,r,mE,r"))]
+  "TARGET_THUMB2
+   && TARGET_HARD_FLOAT && TARGET_FPA
+   && (GET_CODE (operands[0]) != MEM
+       || register_operand (operands[1], SFmode))"
+  "@
+   mvf%?s\\t%0, %1
+   mnf%?s\\t%0, #%N1
+   ldf%?s\\t%0, %1
+   stf%?s\\t%1, %0
+   str%?\\t%1, [%|sp, #-4]!\;ldf%?s\\t%0, [%|sp], #4
+   stf%?s\\t%1, [%|sp, #-4]!\;ldr%?\\t%0, [%|sp], #4
+   mov%?\\t%0, %1 @bar
+   ldr%?\\t%0, %1\\t%@ float
+   str%?\\t%1, %0\\t%@ float"
+  [(set_attr "length" "4,4,4,4,8,8,4,4,4")
+   (set_attr "ce_count" "1,1,1,1,2,2,1,1,1")
+   (set_attr "predicable" "yes")
+   (set_attr "type"
+	 "ffarith,ffarith,f_load,f_store,r_mem_f,f_mem_r,*,load1,store1")
+   (set_attr "pool_range" "*,*,1024,*,*,*,*,4096,*")
+   (set_attr "neg_pool_range" "*,*,1012,*,*,*,*,0,*")]
+)
+
+;; Not predicable because we don't know the number of instructions.
+(define_insn "*thumb2_movdf_fpa"
+  [(set (match_operand:DF 0 "nonimmediate_operand"
+						"=r,Q,r,m,r, f, f,f, m,!f,!r")
+	(match_operand:DF 1 "general_operand"
+						"Q, r,r,r,mF,fG,H,mF,f,r, f"))]
+  "TARGET_THUMB2
+   && TARGET_HARD_FLOAT && TARGET_FPA
+   && (GET_CODE (operands[0]) != MEM
+       || register_operand (operands[1], DFmode))"
+  "*
+  {
+  switch (which_alternative)
+    {
+    default:
+    case 0: return \"ldm%(ia%)\\t%m1, %M0\\t%@ double\";
+    case 1: return \"stm%(ia%)\\t%m0, %M1\\t%@ double\";
+    case 2: case 3: case 4: return output_move_double (operands);
+    case 5: return \"mvf%?d\\t%0, %1\";
+    case 6: return \"mnf%?d\\t%0, #%N1\";
+    case 7: return \"ldf%?d\\t%0, %1\";
+    case 8: return \"stf%?d\\t%1, %0\";
+    case 9: return output_mov_double_fpa_from_arm (operands);
+    case 10: return output_mov_double_arm_from_fpa (operands);
+    }
+  }
+  "
+  [(set_attr "length" "4,4,8,8,8,4,4,4,4,8,8")
+   (set_attr "type"
+    "load1,store2,*,store2,load1,ffarith,ffarith,f_load,f_store,r_mem_f,f_mem_r")
+   (set_attr "pool_range" "*,*,*,*,4092,*,*,1024,*,*,*")
+   (set_attr "neg_pool_range" "*,*,*,*,0,*,*,1020,*,*,*")]
+)
+
+;; Saving and restoring the floating point registers in the prologue should
+;; be done in XFmode, even though we don't support that for anything else
+;; (Well, strictly it's 'internal representation', but that's effectively
+;; XFmode).
+;; Not predicable because we don't know the number of instructions.
+
+(define_insn "*thumb2_movxf_fpa"
+  [(set (match_operand:XF 0 "nonimmediate_operand" "=f,f,f,m,f,r,r")
+	(match_operand:XF 1 "general_operand" "fG,H,m,f,r,f,r"))]
+  "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_FPA && reload_completed"
+  "*
+  switch (which_alternative)
+    {
+    default:
+    case 0: return \"mvf%?e\\t%0, %1\";
+    case 1: return \"mnf%?e\\t%0, #%N1\";
+    case 2: return \"ldf%?e\\t%0, %1\";
+    case 3: return \"stf%?e\\t%1, %0\";
+    case 4: return output_mov_long_double_fpa_from_arm (operands);
+    case 5: return output_mov_long_double_arm_from_fpa (operands);
+    case 6: return output_mov_long_double_arm_from_arm (operands);
+    }
+  "
+  [(set_attr "length" "4,4,4,4,8,8,12")
+   (set_attr "type" "ffarith,ffarith,f_load,f_store,r_mem_f,f_mem_r,*")
+   (set_attr "pool_range" "*,*,1024,*,*,*,*")
+   (set_attr "neg_pool_range" "*,*,1004,*,*,*,*")]
+)
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
+
 (define_insn "*cmpsf_fpa"
   [(set (reg:CCFP CC_REGNUM)
 	(compare:CCFP (match_operand:SF 0 "s_register_operand" "f,f")
 		      (match_operand:SF 1 "arm_float_add_operand" "fG,H")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    cmf%?\\t%0, %1
    cnf%?\\t%0, #%N1"
@@ -625,7 +767,8 @@
   [(set (reg:CCFP CC_REGNUM)
 	(compare:CCFP (match_operand:DF 0 "s_register_operand" "f,f")
 		      (match_operand:DF 1 "arm_float_add_operand" "fG,H")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    cmf%?\\t%0, %1
    cnf%?\\t%0, #%N1"
@@ -638,7 +781,8 @@
 	(compare:CCFP (float_extend:DF
 		       (match_operand:SF 0 "s_register_operand" "f,f"))
 		      (match_operand:DF 1 "arm_float_add_operand" "fG,H")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    cmf%?\\t%0, %1
    cnf%?\\t%0, #%N1"
@@ -651,7 +795,8 @@
 	(compare:CCFP (match_operand:DF 0 "s_register_operand" "f")
 		      (float_extend:DF
 		       (match_operand:SF 1 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "cmf%?\\t%0, %1"
   [(set_attr "conds" "set")
    (set_attr "type" "f_2_r")]
@@ -661,7 +806,8 @@
   [(set (reg:CCFPE CC_REGNUM)
 	(compare:CCFPE (match_operand:SF 0 "s_register_operand" "f,f")
 		       (match_operand:SF 1 "arm_float_add_operand" "fG,H")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    cmf%?e\\t%0, %1
    cnf%?e\\t%0, #%N1"
@@ -673,7 +819,8 @@
   [(set (reg:CCFPE CC_REGNUM)
 	(compare:CCFPE (match_operand:DF 0 "s_register_operand" "f,f")
 		       (match_operand:DF 1 "arm_float_add_operand" "fG,H")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    cmf%?e\\t%0, %1
    cnf%?e\\t%0, #%N1"
@@ -686,7 +833,8 @@
 	(compare:CCFPE (float_extend:DF
 			(match_operand:SF 0 "s_register_operand" "f,f"))
 		       (match_operand:DF 1 "arm_float_add_operand" "fG,H")))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "@
    cmf%?e\\t%0, %1
    cnf%?e\\t%0, #%N1"
@@ -699,7 +847,8 @@
 	(compare:CCFPE (match_operand:DF 0 "s_register_operand" "f")
 		       (float_extend:DF
 			(match_operand:SF 1 "s_register_operand" "f"))))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
   "cmf%?e\\t%0, %1"
   [(set_attr "conds" "set")
    (set_attr "type" "f_2_r")]
@@ -748,3 +897,49 @@
    (set_attr "type" "ffarith")
    (set_attr "conds" "use")]
 )
+
+;; APPLE LOCAL begin v7 support. Merge from mainline
+(define_insn "*thumb2_movsfcc_fpa"
+  [(set (match_operand:SF 0 "s_register_operand" "=f,f,f,f,f,f,f,f")
+	(if_then_else:SF
+	 (match_operator 3 "arm_comparison_operator" 
+	  [(match_operand 4 "cc_register" "") (const_int 0)])
+	 (match_operand:SF 1 "arm_float_add_operand" "0,0,fG,H,fG,fG,H,H")
+	 (match_operand:SF 2 "arm_float_add_operand" "fG,H,0,0,fG,H,fG,H")))]
+  "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_FPA"
+  "@
+   it\\t%D3\;mvf%D3s\\t%0, %2
+   it\\t%D3\;mnf%D3s\\t%0, #%N2
+   it\\t%d3\;mvf%d3s\\t%0, %1
+   it\\t%d3\;mnf%d3s\\t%0, #%N1
+   ite\\t%d3\;mvf%d3s\\t%0, %1\;mvf%D3s\\t%0, %2
+   ite\\t%d3\;mvf%d3s\\t%0, %1\;mnf%D3s\\t%0, #%N2
+   ite\\t%d3\;mnf%d3s\\t%0, #%N1\;mvf%D3s\\t%0, %2
+   ite\\t%d3\;mnf%d3s\\t%0, #%N1\;mnf%D3s\\t%0, #%N2"
+  [(set_attr "length" "6,6,6,6,10,10,10,10")
+   (set_attr "type" "ffarith")
+   (set_attr "conds" "use")]
+)
+
+(define_insn "*thumb2_movdfcc_fpa"
+  [(set (match_operand:DF 0 "s_register_operand" "=f,f,f,f,f,f,f,f")
+	(if_then_else:DF
+	 (match_operator 3 "arm_comparison_operator"
+	  [(match_operand 4 "cc_register" "") (const_int 0)])
+	 (match_operand:DF 1 "arm_float_add_operand" "0,0,fG,H,fG,fG,H,H")
+	 (match_operand:DF 2 "arm_float_add_operand" "fG,H,0,0,fG,H,fG,H")))]
+  "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_FPA"
+  "@
+   it\\t%D3\;mvf%D3d\\t%0, %2
+   it\\t%D3\;mnf%D3d\\t%0, #%N2
+   it\\t%d3\;mvf%d3d\\t%0, %1
+   it\\t%d3\;mnf%d3d\\t%0, #%N1
+   ite\\t%d3\;mvf%d3d\\t%0, %1\;mvf%D3d\\t%0, %2
+   ite\\t%d3\;mvf%d3d\\t%0, %1\;mnf%D3d\\t%0, #%N2
+   ite\\t%d3\;mnf%d3d\\t%0, #%N1\;mvf%D3d\\t%0, %2
+   ite\\t%d3\;mnf%d3d\\t%0, #%N1\;mnf%D3d\\t%0, #%N2"
+  [(set_attr "length" "6,6,6,6,10,10,10,10")
+   (set_attr "type" "ffarith")
+   (set_attr "conds" "use")]
+)
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
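
The new *thumb2_movsfcc_fpa and *thumb2_movdfcc_fpa patterns implement
floating-point selects under Thumb-2, where each conditional FPA move must
be wrapped in an it/ite block.  At the source level this corresponds to code
such as the following C sketch, illustrative only and not part of the patch;
whether these patterns are used depends on the selected FPU and optimization
level:

/* Illustrative only: a floating-point select that conditional-move
   patterns like the ones above can implement without a branch, using an
   ite block around two predicated moves.  */
double clamp_low (double x, double lo)
{
  return (x < lo) ? lo : x;
}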

Added: llvm-gcc-4.2/trunk/gcc/config/arm/gen-darwin-multilib-exceptions.sh
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/gen-darwin-multilib-exceptions.sh?rev=76781&view=auto

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/gen-darwin-multilib-exceptions.sh (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/gen-darwin-multilib-exceptions.sh Wed Jul 22 15:36:27 2009
@@ -0,0 +1,21 @@
+#!/bin/sh
+# APPLE LOCAL file 6611402 configurable multilib architectures
+# This recursive function generates all of the pairwise combinations from a
+# list of multilib options. The result is suitable for a multilib
+# exceptions list.
+EXCEPTIONS=
+function gen_exceptions()
+{
+  if [ $# == 1 ] ; then
+    return
+  fi
+  local opt=$1
+  shift 1
+  for opt2 in $@ ; do
+    EXCEPTIONS+="*$opt*/*$opt2* "
+  done
+  gen_exceptions $@
+}
+if [ $# == 0 ] ; then exit ; fi
+gen_exceptions $@
+echo $EXCEPTIONS

Added: llvm-gcc-4.2/trunk/gcc/config/arm/hwdiv.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/hwdiv.md?rev=76781&view=auto

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/hwdiv.md (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/hwdiv.md Wed Jul 22 15:36:27 2009
@@ -0,0 +1,42 @@
+;; APPLE LOCAL file v7 support. Merge from Codesourcery
+;; ARM instruction patterns for hardware division 
+;; Copyright (C) 2005, 2006, 2007 Free Software Foundation, Inc.
+;; Written by CodeSourcery, LLC.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+(define_insn "divsi3"
+  [(set (match_operand:SI	  0 "s_register_operand" "=r")
+	(div:SI (match_operand:SI 1 "s_register_operand"  "r")
+		(match_operand:SI 2 "s_register_operand"  "r")))]
+  "arm_arch_hwdiv"
+  "sdiv%?\t%0, %1, %2"
+  [(set_attr "predicable" "yes")
+   (set_attr "insn" "sdiv")]
+)
+
+(define_insn "udivsi3"
+  [(set (match_operand:SI	   0 "s_register_operand" "=r")
+	(udiv:SI (match_operand:SI 1 "s_register_operand"  "r")
+		 (match_operand:SI 2 "s_register_operand"  "r")))]
+  "arm_arch_hwdiv"
+  "udiv%?\t%0, %1, %2"
+  [(set_attr "predicable" "yes")
+   (set_attr "insn" "udiv")]
+)
+
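
The divsi3 and udivsi3 patterns above emit the sdiv and udiv instructions
when arm_arch_hwdiv is set; without hardware divide the same source is
expanded to a runtime library call instead.  A trivial C sketch, illustrative
only and not part of the patch:

/* Illustrative only: with hardware divide available these map directly
   onto the sdiv/udiv patterns above; otherwise the division is done
   through the runtime library.  */
int signed_div (int a, int b)
{
  return a / b;
}

unsigned unsigned_div (unsigned a, unsigned b)
{
  return a / b;
}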

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-df.S
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-df.S?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-df.S (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-df.S Wed Jul 22 15:36:27 2009
@@ -91,23 +91,28 @@
 /* APPLE LOCAL ARM MACH assembler */
 ARM_FUNC_ALIAS(aeabi_dadd, adddf3)
 
-1:	stmfd	sp!, {r4, r5, lr}
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+1:	do_push	{r4, r5, lr}
 
 	@ Look for zeroes, equal values, INF, or NAN.
-	mov	r4, xh, lsl #1
-	mov	r5, yh, lsl #1
+	shift1	lsl, r4, xh, #1
+	shift1	lsl, r5, yh, #1
 	teq	r4, r5
+	do_it	eq
 	teqeq	xl, yl
-	orrnes	ip, r4, xl
-	orrnes	ip, r5, yl
-	mvnnes	ip, r4, asr #21
-	mvnnes	ip, r5, asr #21
+	do_it	ne, ttt
+	COND(orr,s,ne)	ip, r4, xl
+	COND(orr,s,ne)	ip, r5, yl
+	COND(mvn,s,ne)	ip, r4, asr #21
+	COND(mvn,s,ne)	ip, r5, asr #21
 	beq	LSYM(Lad_s)
 
 	@ Compute exponent difference.  Make largest exponent in r4,
 	@ corresponding arg in xh-xl, and positive exponent difference in r5.
-	mov	r4, r4, lsr #21
+	shift1	lsr, r4, r4, #21
 	rsbs	r5, r4, r5, lsr #21
+	do_it	lt
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	rsblt	r5, r5, #0
 	ble	1f
 	add	r4, r4, r5
@@ -122,6 +127,8 @@
 	@ already in xh-xl.  We need up to 54 bit to handle proper rounding
 	@ of 0x1p54 - 1.1.
 	cmp	r5, #54
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it hi
 	/* APPLE LOCAL ARM MACH assembler */
 	RETLDM2(hi, r4, r5)
 
@@ -131,15 +138,27 @@
 	mov	ip, #0x00100000
 	orr	xh, ip, xh, lsr #12
 	beq	1f
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+	negs	xl, xl
+	sbc	xh, xh, xh, lsl #1
+#else
 	rsbs	xl, xl, #0
 	rsc	xh, xh, #0
+#endif
 1:
 	tst	yh, #0x80000000
 	mov	yh, yh, lsl #12
 	orr	yh, ip, yh, lsr #12
 	beq	1f
+#if defined(__thumb2__)
+	negs	yl, yl
+	sbc	yh, yh, yh, lsl #1
+#else
 	rsbs	yl, yl, #0
 	rsc	yh, yh, #0
+#endif
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 1:
 	@ If exponent == difference, one or both args were denormalized.
 	@ Since this is not common case, rescale them off line.
@@ -153,27 +172,37 @@
 	@ Shift yh-yl right per r5, add to xh-xl, keep leftover bits into ip.
 	rsbs	lr, r5, #32
 	blt	1f
-	mov	ip, yl, lsl lr
-	adds	xl, xl, yl, lsr r5
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	shift1	lsl, ip, yl, lr
+	shiftop adds, xl, xl, yl, lsr, r5, yl
 	adc	xh, xh, #0
-	adds	xl, xl, yh, lsl lr
-	adcs	xh, xh, yh, asr r5
+	shiftop adds, xl, xl, yh, lsl, lr, yl
+	shiftop adcs, xh, xh, yh, asr, r5, yh
 	b	2f
 1:	sub	r5, r5, #32
 	add	lr, lr, #32
 	cmp	yl, #1
-	mov	ip, yh, lsl lr
+	shift1	lsl,ip, yh, lr
+	do_it	cs
 	orrcs	ip, ip, #2		@ 2 not 1, to allow lsr #1 later
-	adds	xl, xl, yh, asr r5
+	shiftop adds, xl, xl, yh, asr, r5, yh
 	adcs	xh, xh, yh, asr #31
 2:
 	@ We now have a result in xh-xl-ip.
 	@ Keep absolute value in xh-xl-ip, sign in r5 (the n bit was set above)
 	and	r5, xh, #0x80000000
 	bpl	LSYM(Lad_p)
+#if defined(__thumb2__)
+	mov	lr, #0
+	negs	ip, ip
+	sbcs	xl, lr, xl
+	sbc	xh, lr, xh
+#else
 	rsbs	ip, ip, #0
 	rscs	xl, xl, #0
 	rsc	xh, xh, #0
+#endif
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 
 	@ Determine how to normalize the result.
 LSYM(Lad_p):
@@ -199,7 +228,10 @@
 	@ Pack final result together.
 LSYM(Lad_e):
 	cmp	ip, #0x80000000
-	moveqs	ip, xl, lsr #1
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	eq
+	COND(mov,s,eq)	ip, xl, lsr #1
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	adcs	xl, xl, #0
 	adc	xh, xh, r4, lsl #20
 	orr	xh, xh, r5
@@ -243,9 +275,13 @@
 #else
 
 	teq	xh, #0
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq, t
 	moveq	xh, xl
 	moveq	xl, #0
 	clz	r3, xh
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	addeq	r3, r3, #32
 	sub	r3, r3, #11
 
@@ -261,20 +297,31 @@
 	@ since a register switch happened above.
 	add	ip, r2, #20
 	rsb	r2, r2, #12
-	mov	xl, xh, lsl ip
-	mov	xh, xh, lsr r2
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	shift1	lsl, xl, xh, ip
+	shift1	lsr, xh, xh, r2
 	b	3f
 
 	@ actually shift value left 1 to 20 bits, which might also represent
 	@ 32 to 52 bits if counting the register switch that happened earlier.
 1:	add	r2, r2, #20
-2:	rsble	ip, r2, #32
-	mov	xh, xh, lsl r2
+2:	do_it	le
+	rsble	ip, r2, #32
+	shift1	lsl, xh, xh, r2
+#if defined(__thumb2__)
+	lsr	ip, xl, ip
+	itt	le
+	orrle	xh, xh, ip
+	lslle	xl, xl, r2
+#else
 	orrle	xh, xh, xl, lsr ip
 	movle	xl, xl, lsl r2
+#endif
 
 	@ adjust exponent accordingly.
 3:	subs	r4, r4, r3
+	do_it	ge, tt
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	addge	xh, xh, r4, lsl #20
 	orrge	xh, xh, r5
 	/* APPLE LOCAL ARM MACH assembler */
@@ -291,9 +338,11 @@
 	@ shift result right of 1 to 20 bits, sign is in r5.
 	add	r4, r4, #20
 	rsb	r2, r4, #32
-	mov	xl, xl, lsr r4
-	orr	xl, xl, xh, lsl r2
-	orr	xh, r5, xh, lsr r4
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	shift1  lsr, xl, xl, r4
+	shiftop orr, xl, xl, xh, lsl, r2, yh
+	shiftop orr, xh, r5, xh, lsr, r4, yh
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	/* APPLE LOCAL ARM MACH assembler */
 	RETLDM1(r4, r5)
 
@@ -301,15 +350,18 @@
 	@ a register switch from xh to xl.
 1:	rsb	r4, r4, #12
 	rsb	r2, r4, #32
-	mov	xl, xl, lsr r2
-	orr	xl, xl, xh, lsl r4
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	shift1  lsr, xl, xl, r2
+	shiftop orr, xl, xl, xh, lsl, r4, yh
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	mov	xh, r5
 	/* APPLE LOCAL ARM MACH assembler */
 	RETLDM1(r4, r5)
 
 	@ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch
 	@ from xh to xl.
-2:	mov	xl, xh, lsr r4
+	/* APPLE LOCAL v7 support. Merge from mainline */
+2:	shift1  lsr, xl, xh, r4
 	mov	xh, r5
 	/* APPLE LOCAL ARM MACH assembler */
 	RETLDM1(r4, r5)
@@ -319,6 +371,8 @@
 LSYM(Lad_d):
 	teq	r4, #0
 	eor	yh, yh, #0x00100000
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq, te
 	eoreq	xh, xh, #0x00100000
 	addeq	r4, r4, #1
 	subne	r5, r5, #1
@@ -327,15 +381,20 @@
 
 LSYM(Lad_s):
 	mvns	ip, r4, asr #21
-	mvnnes	ip, r5, asr #21
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it ne
+	COND(mvn,s,ne)  ip, r5, asr #21
 	beq	LSYM(Lad_i)
 
 	teq	r4, r5
+	do_it eq
 	teqeq	xl, yl
 	beq	1f
 
 	@ Result is x + 0.0 = x or 0.0 + y = y.
-	orrs	ip, r4, xl
+	orrs  ip, r4, xl
+	do_it eq, t
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	moveq	xh, yh
 	moveq	xl, yl
 	/* APPLE LOCAL ARM MACH assembler */
@@ -344,6 +403,8 @@
 1:	teq	xh, yh
 
 	@ Result is x - x = 0.
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ne, tt
 	movne	xh, #0
 	movne	xl, #0
 	/* APPLE LOCAL ARM MACH assembler */
@@ -354,10 +415,14 @@
 	bne	2f
 	movs	xl, xl, lsl #1
 	adcs	xh, xh, xh
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it cs
 	orrcs	xh, xh, #0x80000000
 	/* APPLE LOCAL ARM MACH assembler */
 	RETLDM1(r4, r5)
 2:	adds	r4, r4, #(2 << 21)
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it cc, t
 	addcc	xh, xh, #(1 << 20)
 	/* APPLE LOCAL ARM MACH assembler */
 	RETLDM2(cc, r4, r5)
@@ -379,13 +444,18 @@
 	@   otherwise return xh-xl (which is INF or -INF)
 LSYM(Lad_i):
 	mvns	ip, r4, asr #21
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne, te
 	movne	xh, yh
 	movne	xl, yl
-	mvneqs	ip, r5, asr #21
+	COND(mvn,s,eq)	ip, r5, asr #21
+	do_it	ne, t
 	movne	yh, xh
 	movne	yl, xl
 	orrs	r4, xl, xh, lsl #12
-	orreqs	r5, yl, yh, lsl #12
+	do_it	eq, te
+	COND(orr,s,eq)	r5, yl, yh, lsl #12
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	teqeq	xh, yh
 	orrne	xh, xh, #0x00080000	@ quiet NAN
 	/* APPLE LOCAL ARM MACH assembler */
@@ -401,9 +471,12 @@
 ARM_FUNC_ALIAS(aeabi_ui2d,floatunsidf)
 
 	teq	r0, #0
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq, t
 	moveq	r1, #0
 	RETc(eq)
-	stmfd	sp!, {r4, r5, lr}
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_push	{r4, r5, lr}
 	mov	r4, #0x400		@ initial exponent
 	add	r4, r4, #(52-1 - 1)
 	mov	r5, #0			@ sign bit is 0
@@ -423,12 +496,17 @@
 ARM_FUNC_ALIAS(aeabi_i2d,floatsidf)
 
 	teq	r0, #0
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq, t
 	moveq	r1, #0
 	RETc(eq)
-	stmfd	sp!, {r4, r5, lr}
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_push	{r4, r5, lr}
 	mov	r4, #0x400		@ initial exponent
 	add	r4, r4, #(52-1 - 1)
 	ands	r5, r0, #0x80000000	@ sign bit in r5
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	mi
 	rsbmi	r0, r0, #0		@ absolute value
 	/* APPLE LOCAL begin ARM MACH assembler */
 #if !defined(__VFP_FP__) || defined(__ARMEB__)
@@ -449,17 +527,21 @@
 	mov	xh, r2, asr #3		@ stretch exponent
 	mov	xh, xh, rrx		@ retrieve sign bit
 	mov	xl, r2, lsl #28		@ retrieve remaining bits
-	andnes	r3, r2, #0xff000000	@ isolate exponent
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne, ttt
+	COND(and,s,ne)	r3, r2, #0xff000000	@ isolate exponent
 	teqne	r3, #0xff000000		@ if not 0, check if INF or NAN
 	eorne	xh, xh, #0x38000000	@ fixup exponent otherwise.
 	RETc(ne)			@ and return it.
 
 	teq	r2, #0			@ if actually 0
+	do_it	ne, e
 	teqne	r3, #0xff000000		@ or INF or NAN
 	RETc(eq)			@ we are done already.
 
 	@ value was denormalized.  We can normalize it now.
-	stmfd	sp!, {r4, r5, lr}
+	do_push	{r4, r5, lr}
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	mov	r4, #0x380		@ setup corresponding exponent
 	and	r5, xh, #0x80000000	@ move sign bit in r5
 	bic	xh, xh, #0x80000000
@@ -474,7 +556,12 @@
 
 	orrs	r2, r0, r1
 #if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	eq, t
 	mvfeqd	f0, #0.0
+#else
+	do_it	eq
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 #endif
 	RETc(eq)
 
@@ -483,9 +570,11 @@
 	@ we can return the result in f0 as well as in r0/r1 for backwards
 	@ compatibility.
 	adr	ip, LSYM(f0_ret)
-	stmfd	sp!, {r4, r5, ip, lr}
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_push	{r4, r5, ip, lr}
 #else
-	stmfd	sp!, {r4, r5, lr}
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_push	{r4, r5, lr}
 #endif
 
 	mov	r5, #0
@@ -497,7 +586,11 @@
 
 	orrs	r2, r0, r1
 #if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	eq, t
 	mvfeqd	f0, #0.0
+#else
+	do_it	eq
 #endif
 	RETc(eq)
 
@@ -506,15 +599,21 @@
 	@ we can return the result in f0 as well as in r0/r1 for backwards
 	@ compatibility.
 	adr	ip, LSYM(f0_ret)
-	stmfd	sp!, {r4, r5, ip, lr}
+	do_push	{r4, r5, ip, lr}
 #else
-	stmfd	sp!, {r4, r5, lr}
+	do_push	{r4, r5, lr}
 #endif
 
 	ands	r5, ah, #0x80000000	@ sign bit in r5
 	bpl	2f
+#if defined(__thumb2__)
+	negs	al, al
+	sbc	ah, ah, ah, lsl #1
+#else
 	rsbs	al, al, #0
 	rsc	ah, ah, #0
+#endif
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 2:
 	mov	r4, #0x400		@ initial exponent
 	add	r4, r4, #(52-1 - 1)
@@ -534,16 +633,20 @@
 	@ The value is too big.  Scale it down a bit...
 	mov	r2, #3
 	movs	ip, ip, lsr #3
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne
 	addne	r2, r2, #3
 	movs	ip, ip, lsr #3
+	do_it	ne
 	addne	r2, r2, #3
 	add	r2, r2, ip, lsr #3
 
 	rsb	r3, r2, #32
-	mov	ip, xl, lsl r3
-	mov	xl, xl, lsr r2
-	orr	xl, xl, xh, lsl r3
-	mov	xh, xh, lsr r2
+	shift1	lsl, ip, xl, r3
+	shift1	lsr, xl, xl, r2
+	shiftop orr, xl, xl, xh, lsl, r3, lr
+	shift1	lsr, xh, xh, r2
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	add	r4, r4, r2
 	b	LSYM(Lad_p)
 
@@ -552,7 +655,8 @@
 	@ Legacy code expects the result to be returned in f0.  Copy it
 	@ there as well.
 LSYM(f0_ret):
-	stmfd	sp!, {r0, r1}
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_push	{r0, r1}
 	ldfd	f0, [sp], #8
 	RETLDM
 
@@ -570,13 +674,17 @@
 ARM_FUNC_START muldf3
 /* APPLE LOCAL ARM MACH assembler */
 ARM_FUNC_ALIAS(aeabi_dmul,muldf3)
-	stmfd	sp!, {r4, r5, r6, lr}
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_push {r4, r5, r6, lr}
 
 	@ Mask out exponents, trap any zero/denormal/INF/NAN.
 	mov	ip, #0xff
 	orr	ip, ip, #0x700
 	ands	r4, ip, xh, lsr #20
-	andnes	r5, ip, yh, lsr #20
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it ne, tte 
+	COND(and,s,ne)  r5, ip, yh, lsr #20
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	teqne	r4, ip
 	teqne	r5, ip
 	bleq	LSYM(Lml_s)
@@ -592,7 +700,10 @@
 	bic	xh, xh, ip, lsl #21
 	bic	yh, yh, ip, lsl #21
 	orrs	r5, xl, xh, lsl #12
-	orrnes	r5, yl, yh, lsl #12
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne
+	COND(orr,s,ne)	r5, yl, yh, lsl #12
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	orr	xh, xh, #0x00100000
 	orr	yh, yh, #0x00100000
 	beq	LSYM(Lml_1)
@@ -673,6 +784,8 @@
 	@ The LSBs in ip are only significant for the final rounding.
 	@ Fold them into lr.
 	teq	ip, #0
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ne
 	orrne	lr, lr, #1
 
 	@ Adjust result upon the MSB position.
@@ -693,12 +806,16 @@
 
 	@ Check exponent range for under/overflow.
 	subs	ip, r4, #(254 - 1)
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	hi
 	cmphi	ip, #0x700
 	bhi	LSYM(Lml_u)
 
 	@ Round the result, merge final exponent.
 	cmp	lr, #0x80000000
-	moveqs	lr, xl, lsr #1
+	do_it	eq
+	COND(mov,s,eq)	lr, xl, lsr #1
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	adcs	xl, xl, #0
 	adc	xh, xh, r4, lsl #20
 	/* APPLE LOCAL ARM MACH assembler */
@@ -711,7 +828,10 @@
 	orr	xl, xl, yl
 	eor	xh, xh, yh
 	subs	r4, r4, ip, lsr #1
-	rsbgts	r5, r4, ip
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	gt, tt
+	COND(rsb,s,gt)	r5, r4, ip
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	orrgt	xh, xh, r4, lsl #20
 	/* APPLE LOCAL ARM MACH assembler */
 	RETLDM2(gt, r4, r5, r6)
@@ -727,6 +847,8 @@
 
 	@ Check if denormalized result is possible, otherwise return signed 0.
 	cmn	r4, #(53 + 1)
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	le, tt
 	movle	xl, #0
 	bicle	xh, xh, #0x7fffffff
 	/* APPLE LOCAL ARM MACH assembler */
@@ -742,14 +864,17 @@
 	@ shift result right of 1 to 20 bits, preserve sign bit, round, etc.
 	add	r4, r4, #20
 	rsb	r5, r4, #32
-	mov	r3, xl, lsl r5
-	mov	xl, xl, lsr r4
-	orr	xl, xl, xh, lsl r5
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	shift1	lsl, r3, xl, r5
+	shift1	lsr, xl, xl, r4
+	shiftop orr, xl, xl, xh, lsl, r5, r2
 	and	r2, xh, #0x80000000
 	bic	xh, xh, #0x80000000
 	adds	xl, xl, r3, lsr #31
-	adc	xh, r2, xh, lsr r4
+	shiftop adc, xh, r2, xh, lsr, r4, r6
 	orrs	lr, lr, r3, lsl #1
+	do_it	eq
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	biceq	xl, xl, r3, lsr #31
 	/* APPLE LOCAL ARM MACH assembler */
 	RETLDM1(r4, r5, r6)
@@ -758,13 +883,16 @@
 	@ a register switch from xh to xl. Then round.
 1:	rsb	r4, r4, #12
 	rsb	r5, r4, #32
-	mov	r3, xl, lsl r4
-	mov	xl, xl, lsr r5
-	orr	xl, xl, xh, lsl r4
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	shift1  lsl, r3, xl, r4
+	shift1  lsr, xl, xl, r5
+	shiftop orr, xl, xl, xh, lsl, r4, r2
 	bic	xh, xh, #0x7fffffff
 	adds	xl, xl, r3, lsr #31
 	adc	xh, xh, #0
 	orrs	lr, lr, r3, lsl #1
+	do_it eq
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	biceq	xl, xl, r3, lsr #31
 	/* APPLE LOCAL ARM MACH assembler */
 	RETLDM1(r4, r5, r6)
@@ -772,14 +900,17 @@
 	@ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch
 	@ from xh to xl.  Leftover bits are in r3-r6-lr for rounding.
 2:	rsb	r5, r4, #32
-	orr	lr, lr, xl, lsl r5
-	mov	r3, xl, lsr r4
-	orr	r3, r3, xh, lsl r5
-	mov	xl, xh, lsr r4
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	shiftop orr, lr, lr, xl, lsl, r5, r2
+	shift1  lsr, r3, xl, r4
+	shiftop orr, r3, r3, xh, lsl, r5, r2
+	shift1  lsr, xl, xh, r4
 	bic	xh, xh, #0x7fffffff
-	bic	xl, xl, xh, lsr r4
+	shiftop bic, xl, xl, xh, lsr, r4, r2
 	add	xl, xl, r3, lsr #31
 	orrs	lr, lr, r3, lsl #1
+	do_it eq
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	biceq	xl, xl, r3, lsr #31
 	/* APPLE LOCAL ARM MACH assembler */
 	RETLDM1(r4, r5, r6)
@@ -793,15 +924,21 @@
 1:	movs	xl, xl, lsl #1
 	adc	xh, xh, xh
 	tst	xh, #0x00100000
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	subeq	r4, r4, #1
 	beq	1b
 	orr	xh, xh, r6
 	teq	r5, #0
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ne
 	movne	pc, lr
 2:	and	r6, yh, #0x80000000
 3:	movs	yl, yl, lsl #1
 	adc	yh, yh, yh
 	tst	yh, #0x00100000
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	subeq	r5, r5, #1
 	beq	3b
 	orr	yh, yh, r6
@@ -811,12 +948,17 @@
 	@ Isolate the INF and NAN cases away
 	teq	r4, ip
 	and	r5, ip, yh, lsr #20
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it ne
 	teqne	r5, ip
 	beq	1f
 
 	@ Here, one or more arguments are either denormalized or zero.
 	orrs	r6, xl, xh, lsl #1
-	orrnes	r6, yl, yh, lsl #1
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it ne
+	COND(orr,s,ne)  r6, yl, yh, lsl #1
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	bne	LSYM(Lml_d)
 
 	@ Result is 0, but determine sign anyway.
@@ -829,9 +971,12 @@
 
 1:	@ One or both args are INF or NAN.
 	orrs	r6, xl, xh, lsl #1
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it eq, te
 	moveq	xl, yl
 	moveq	xh, yh
-	orrnes	r6, yl, yh, lsl #1
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	COND(orr,s,ne)  r6, yl, yh, lsl #1
 	beq	LSYM(Lml_n)		@ 0 * INF or INF * 0 -> NAN
 	teq	r4, ip
 	bne	1f
@@ -840,6 +985,8 @@
 1:	teq	r5, ip
 	bne	LSYM(Lml_i)
 	orrs	r6, yl, yh, lsl #12
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ne, t
 	movne	xl, yl
 	movne	xh, yh
 	bne	LSYM(Lml_n)		@ <anything> * NAN -> NAN
@@ -871,13 +1018,17 @@
 /* APPLE LOCAL ARM MACH assembler */
 ARM_FUNC_ALIAS(aeabi_ddiv,divdf3)
 	
-	stmfd	sp!, {r4, r5, r6, lr}
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_push	{r4, r5, r6, lr}
 
 	@ Mask out exponents, trap any zero/denormal/INF/NAN.
 	mov	ip, #0xff
 	orr	ip, ip, #0x700
 	ands	r4, ip, xh, lsr #20
-	andnes	r5, ip, yh, lsr #20
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne, tte
+	COND(and,s,ne)	r5, ip, yh, lsr #20
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	teqne	r4, ip
 	teqne	r5, ip
 	bleq	LSYM(Ldv_s)
@@ -908,6 +1059,8 @@
 	@ Ensure result will land to known bit position.
 	@ Apply exponent bias accordingly.
 	cmp	r5, yh
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	cmpeq	r6, yl
 	adc	r4, r4, #(255 - 2)
 	add	r4, r4, #0x300
@@ -926,6 +1079,8 @@
 	@ The actual division loop.
 1:	subs	lr, r6, yl
 	sbcs	lr, r5, yh
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	cs, tt
 	subcs	r6, r6, yl
 	movcs	r5, lr
 	orrcs	xl, xl, ip
@@ -933,6 +1088,8 @@
 	mov	yl, yl, rrx
 	subs	lr, r6, yl
 	sbcs	lr, r5, yh
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	cs, tt
 	subcs	r6, r6, yl
 	movcs	r5, lr
 	orrcs	xl, xl, ip, lsr #1
@@ -940,6 +1097,8 @@
 	mov	yl, yl, rrx
 	subs	lr, r6, yl
 	sbcs	lr, r5, yh
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	cs, tt
 	subcs	r6, r6, yl
 	movcs	r5, lr
 	orrcs	xl, xl, ip, lsr #2
@@ -947,6 +1106,8 @@
 	mov	yl, yl, rrx
 	subs	lr, r6, yl
 	sbcs	lr, r5, yh
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	cs, tt
 	subcs	r6, r6, yl
 	movcs	r5, lr
 	orrcs	xl, xl, ip, lsr #3
@@ -973,18 +1134,25 @@
 2:
 	@ Be sure result starts in the high word.
 	tst	xh, #0x00100000
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq, t
 	orreq	xh, xh, xl
 	moveq	xl, #0
 3:
 	@ Check exponent range for under/overflow.
 	subs	ip, r4, #(254 - 1)
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	hi
 	cmphi	ip, #0x700
 	bhi	LSYM(Lml_u)
 
 	@ Round the result, merge final exponent.
 	subs	ip, r5, yh
-	subeqs	ip, r6, yl
-	moveqs	ip, xl, lsr #1
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	eq, t
+	COND(sub,s,eq)	ip, r6, yl
+	COND(mov,s,eq)	ip, xl, lsr #1
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	adcs	xl, xl, #0
 	adc	xh, xh, r4, lsl #20
 	/* APPLE LOCAL ARM MACH assembler */
@@ -995,7 +1163,10 @@
 	and	lr, lr, #0x80000000
 	orr	xh, lr, xh, lsr #12
 	adds	r4, r4, ip, lsr #1
-	rsbgts	r5, r4, ip
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	gt, tt
+	COND(rsb,s,gt)	r5, r4, ip
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	orrgt	xh, xh, r4, lsl #20
 	/* APPLE LOCAL ARM MACH assembler */
 	RETLDM2(gt, r4, r5, r6)
@@ -1015,6 +1186,8 @@
 LSYM(Ldv_s):
 	and	r5, ip, yh, lsr #20
 	teq	r4, ip
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	teqeq	r5, ip
 	beq	LSYM(Lml_n)		@ INF/NAN / INF/NAN -> NAN
 	teq	r4, ip
@@ -1035,7 +1208,10 @@
 	b	LSYM(Lml_n)		@ <anything> / NAN -> NAN
 2:	@ If both are nonzero, we need to normalize and resume above.
 	orrs	r6, xl, xh, lsl #1
-	orrnes	r6, yl, yh, lsl #1
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne
+	COND(orr,s,ne)	r6, yl, yh, lsl #1
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	bne	LSYM(Lml_d)
 	@ One or both arguments are 0.
 	orrs	r4, xl, xh, lsl #1
@@ -1078,14 +1254,21 @@
 	mov	ip, xh, lsl #1
 	mvns	ip, ip, asr #21
 	mov	ip, yh, lsl #1
-	mvnnes	ip, ip, asr #21
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne
+	COND(mvn,s,ne)	ip, ip, asr #21
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	beq	3f
 
 	@ Test for equality.
 	@ Note that 0.0 is equal to -0.0.
 2:	orrs	ip, xl, xh, lsl #1	@ if x == 0.0 or -0.0
-	orreqs	ip, yl, yh, lsl #1	@ and y == 0.0 or -0.0
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	eq, e
+	COND(orr,s,eq)	ip, yl, yh, lsl #1	@ and y == 0.0 or -0.0
 	teqne	xh, yh			@ or xh == yh
+	do_it	eq, tt
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	teqeq	xl, yl			@ and xl == yl
 	moveq	r0, #0			@ then equal.
 	RETc(eq)
@@ -1097,10 +1280,16 @@
 	teq	xh, yh
 
 	@ Compare values if same sign
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	pl
 	cmppl	xh, yh
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	cmpeq	xl, yl
 
 	@ Result:
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	cs, e
 	movcs	r0, yh, asr #31
 	mvncc	r0, yh, asr #31
 	orr	r0, r0, #1
@@ -1144,13 +1333,17 @@
 
 	@ The status-returning routines are required to preserve all
 	@ registers except ip, lr, and cpsr.
-6:	stmfd	sp!, {r0, lr}
+	/* APPLE LOCAL v7 support. Merge from mainline */
+6:	do_push {r0, lr}
 	ARM_CALL cmpdf2
 	@ Set the Z flag correctly, and the C flag unconditionally.
 	cmp	 r0, #0
 	@ Clear the C flag if the return value was -1, indicating
 	@ that the first operand was smaller than the second.
-	cmnmi	 r0, #0
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it mi
+	cmnmi r0, #0
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	/* APPLE LOCAL ARM MACH assembler */
 	RETLDM1(r0)
 
@@ -1162,6 +1355,8 @@
 
 	str	lr, [sp, #-8]!
 	ARM_CALL aeabi_cdcmple
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq, e
 	moveq	r0, #1	@ Equal to.
 	movne	r0, #0	@ Less than, greater than, or unordered.
 	RETLDM
@@ -1172,6 +1367,8 @@
 
 	str	lr, [sp, #-8]!
 	ARM_CALL aeabi_cdcmple
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	cc, e
 	movcc	r0, #1	@ Less than.
 	movcs	r0, #0	@ Equal to, greater than, or unordered.
 	RETLDM
@@ -1182,6 +1379,8 @@
 
 	str	lr, [sp, #-8]!
 	ARM_CALL aeabi_cdcmple
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ls, e
 	movls	r0, #1  @ Less than or equal to.
 	movhi	r0, #0	@ Greater than or unordered.
 	RETLDM
@@ -1192,6 +1391,8 @@
 
 	str	lr, [sp, #-8]!
 	ARM_CALL aeabi_cdrcmple
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ls, e
 	movls	r0, #1	@ Operand 2 is less than or equal to operand 1.
 	movhi	r0, #0	@ Operand 2 greater than operand 1, or unordered.
 	RETLDM
@@ -1202,6 +1403,8 @@
 
 	str	lr, [sp, #-8]!
 	ARM_CALL aeabi_cdrcmple
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	cc, e
 	movcc	r0, #1	@ Operand 2 is less than operand 1.
 	movcs	r0, #0  @ Operand 2 is greater than or equal to operand 1,
 			@ or they are unordered.
@@ -1258,7 +1461,10 @@
 	orr	r3, r3, #0x80000000
 	orr	r3, r3, xl, lsr #21
 	tst	xh, #0x80000000		@ the sign bit
-	mov	r0, r3, lsr r2
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	shift1	lsr, r0, r3, r2
+	do_it	ne
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	rsbne	r0, r0, #0
 	RET
 
@@ -1268,6 +1474,8 @@
 2:	orrs	xl, xl, xh, lsl #12
 	bne	4f			@ x is NAN.
 3:	ands	r0, xh, #0x80000000	@ the sign bit
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	moveq	r0, #0x7fffffff		@ maximum signed positive si
 	RET
 
@@ -1299,7 +1507,8 @@
 	mov	r3, xh, lsl #11
 	orr	r3, r3, #0x80000000
 	orr	r3, r3, xl, lsr #21
-	mov	r0, r3, lsr r2
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	shift1	lsr, r0, r3, r2
 	RET
 
 1:	mov	r0, #0
@@ -1327,8 +1536,11 @@
 	@ check exponent range.
 	mov	r2, xh, lsl #1
 	subs	r3, r2, #((1023 - 127) << 21)
-	subcss	ip, r3, #(1 << 21)
-	rsbcss	ip, ip, #(254 << 21)
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	cs, t
+	COND(sub,s,cs)	ip, r3, #(1 << 21)
+	COND(rsb,s,cs)	ip, ip, #(254 << 21)
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	bls	2f			@ value is out of range
 
 1:	@ shift and round mantissa
@@ -1337,6 +1549,8 @@
 	orr	xl, ip, xl, lsr #29
 	cmp	r2, #0x80000000
 	adc	r0, xl, r3, lsl #2
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	biceq	r0, r0, #1
 	RET
 
@@ -1346,6 +1560,8 @@
 
 	@ check if denormalized value is possible
 	adds	r2, r3, #(23 << 21)
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	lt, t
 	andlt	r0, xh, #0x80000000	@ too small, return signed 0.
 	RETc(lt)
 
@@ -1354,13 +1570,20 @@
 	mov	r2, r2, lsr #21
 	rsb	r2, r2, #24
 	rsb	ip, r2, #32
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+	lsls	r3, xl, ip
+#else
 	movs	r3, xl, lsl ip
-	mov	xl, xl, lsr r2
+#endif
+	shift1	lsr, xl, xl, r2
+	do_it	ne
 	orrne	xl, xl, #1		@ fold r3 for rounding considerations. 
 	mov	r3, xh, lsl #11
 	mov	r3, r3, lsr #11
-	orr	xl, xl, r3, lsl ip
-	mov	r3, r3, lsr r2
+	shiftop orr, xl, xl, r3, lsl, ip, ip
+	shift1	lsr, r3, r3, r2
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 	mov	r3, r3, lsl #1
 	b	1b
 
@@ -1368,6 +1591,8 @@
 	mvns	r3, r2, asr #21
 	bne	5f			@ simple overflow
 	orrs	r3, xl, xh, lsl #12
+	/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ne, tt
 	movne	r0, #0x7f000000
 	orrne	r0, r0, #0x00c00000
 	RETc(ne)			@ return NAN
@@ -1450,6 +1675,7 @@
 	fmdrr	d7, r2, r3
 	fcmpd	d6, d7
 	fmstat
+	do_it	ne, e
 	movne	r0, #0
 	moveq	r0, #1
 	RET
@@ -1466,6 +1692,7 @@
 	fmdrr	d7, r2, r3
 	fcmpd	d6, d7
 	fmstat
+	do_it	eq, e
 	moveq	r0, #0
 	movne	r0, #1
 	RET
@@ -1482,6 +1709,7 @@
 	fmdrr	d7, r2, r3
 	fcmpd	d6, d7
 	fmstat
+	do_it	pl, e
 	movpl	r0, #0
 	movmi	r0, #1
 	RET
@@ -1498,6 +1726,7 @@
 	fmdrr	d7, r2, r3
 	fcmpd	d6, d7
 	fmstat
+	do_it	le, e
 	movle	r0, #0
 	movgt	r0, #1
 	RET
@@ -1514,6 +1743,7 @@
 	fmdrr	d7, r2, r3
 	fcmpd	d6, d7
 	fmstat
+	do_it	hi, e
 	movhi	r0, #0
 	movls	r0, #1
 	RET
@@ -1530,6 +1760,7 @@
 	fmdrr	d7, r2, r3
 	fcmpd	d6, d7
 	fmstat
+	do_it	lt, e
 	movlt	r0, #0
 	movge	r0, #1
 	RET
@@ -1546,6 +1777,7 @@
 	fmdrr	d7, r2, r3
 	fcmpd	d6, d7
 	fmstat
+	do_it	vc, e
 	movvc	r0, #0
 	movvs	r0, #1
 	RET
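
The pattern throughout this file is mechanical: each conditionally executed instruction gains an explicit do_it before it (Thumb-2 requires an IT block, ARM mode does not), pre-UAL conditional mnemonics such as orrnes become COND(orr,s,ne) so the assembler sees the suffixes in the right order for either mode, register-shifted operands go through shift1/shiftop with a scratch register, and stmfd/ldmfd become do_push/do_pop.  The sketch below is only an approximation of how such macros can be written in GNU as with the preprocessor; the real definitions are presumably added to lib1funcs.asm (modified further down), so treat these exact spellings as an assumption rather than the committed code.

#if defined(__thumb2__)
	.macro	do_it cond, suffix=""
	it\suffix	\cond			@ Thumb-2: emit the IT block
	.endm
	.macro	shift1 op, dst, src, amt
	\op	\dst, \src, \amt		@ shifts are standalone instructions
	.endm
	.macro	shiftop name, dst, src1, src2, sh, amt, tmp
	\sh	\tmp, \src2, \amt		@ no register-shifted operands here,
	\name	\dst, \src1, \tmp		@ so shift into a scratch register first
	.endm
#define do_push	push
#define COND(op, s, cond)	op ## s ## cond		@ UAL order, e.g. orrsne
#else
	.macro	do_it cond, suffix=""
	.endm					@ ARM mode: predication is free, emit nothing
	.macro	shift1 op, dst, src, amt
	mov	\dst, \src, \op \amt
	.endm
	.macro	shiftop name, dst, src1, src2, sh, amt, tmp
	\name	\dst, \src1, \src2, \sh \amt
	.endm
#define do_push	stmfd sp!,
#define COND(op, s, cond)	op ## cond ## s		@ legacy order, e.g. orrnes
#endif

With definitions along these lines, a source line such as COND(orr,s,ne) ip, r4, xl assembles to orrnes ip, r4, xl in ARM mode and to orrsne ip, r4, xl under the preceding IT block in Thumb-2, which is why the diff pairs every COND() with a do_it.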

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-sf.S
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-sf.S?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-sf.S (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-sf.S Wed Jul 22 15:36:27 2009
@@ -74,36 +74,50 @@
 
 1:	@ Look for zeroes, equal values, INF, or NAN.
 	movs	r2, r0, lsl #1
-	movnes	r3, r1, lsl #1
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne, ttt
+	COND(mov,s,ne)	r3, r1, lsl #1
 	teqne	r2, r3
-	mvnnes	ip, r2, asr #24
-	mvnnes	ip, r3, asr #24
+	COND(mvn,s,ne)	ip, r2, asr #24
+	COND(mvn,s,ne)	ip, r3, asr #24
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	beq	LSYM(Lad_s)
 
 	@ Compute exponent difference.  Make largest exponent in r2,
 	@ corresponding arg in r0, and positive exponent difference in r3.
 	mov	r2, r2, lsr #24
 	rsbs	r3, r2, r3, lsr #24
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	gt, ttt
 	addgt	r2, r2, r3
 	eorgt	r1, r0, r1
 	eorgt	r0, r1, r0
 	eorgt	r1, r0, r1
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	lt
 	rsblt	r3, r3, #0
 
 	@ If exponent difference is too large, return largest argument
 	@ already in r0.  We need up to 25 bit to handle proper rounding
 	@ of 0x1p25 - 1.1.
 	cmp	r3, #25
+	/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	hi
 	RETc(hi)
+	/* APPLE LOCAL end v7 support. Merge from mainline */
 
 	@ Convert mantissa to signed integer.
 	tst	r0, #0x80000000
 	orr	r0, r0, #0x00800000
 	bic	r0, r0, #0xff000000
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ne
 	rsbne	r0, r0, #0
 	tst	r1, #0x80000000
 	orr	r1, r1, #0x00800000
 	bic	r1, r1, #0xff000000
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ne
 	rsbne	r1, r1, #0
 
 	@ If exponent == difference, one or both args were denormalized.
@@ -117,15 +131,22 @@
 
 	@ Shift and add second arg to first arg in r0.
 	@ Keep leftover bits into r1.
-	adds	r0, r0, r1, asr r3
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	shiftop adds, r0, r0, r1, asr, r3, ip
 	rsb	r3, r3, #32
-	mov	r1, r1, lsl r3
+	shift1	lsl, r1, r1, r3
 
 	@ Keep absolute value in r0-r1, sign in r3 (the n bit was set above)
 	and	r3, r0, #0x80000000
 	bpl	LSYM(Lad_p)
+#if defined(__thumb2__)
+	negs	r1, r1
+	sbc	r0, r0, r0, lsl #1
+#else
 	rsbs	r1, r1, #0
 	rsc	r0, r0, #0
+#endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
 
 	@ Determine how to normalize the result.
 LSYM(Lad_p):
@@ -150,6 +171,8 @@
 LSYM(Lad_e):
 	cmp	r1, #0x80000000
 	adc	r0, r0, r2, lsl #23
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	biceq	r0, r0, #1
 	orr	r0, r0, r3
 	RET
@@ -188,16 +211,26 @@
 	clz	ip, r0
 	sub	ip, ip, #8
 	subs	r2, r2, ip
-	mov	r0, r0, lsl ip
+/* APPLE LOCAL v7 support. Merge from mainline */
+	shift1	lsl, r0, r0, ip
 
 #endif
 
 	@ Final result with sign
 	@ If exponent negative, denormalize result.
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ge, et
 	addge	r0, r0, r2, lsl #23
 	rsblt	r2, r2, #0
 	orrge	r0, r0, r3
+#if defined(__thumb2__)
+	do_it	lt, t
+	lsrlt	r0, r0, r2
+	orrlt	r0, r3, r0
+#else
 	orrlt	r0, r3, r0, lsr r2
+#endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	RET
 
 	@ Fixup and adjust bit position for denormalized arguments.
@@ -205,6 +238,8 @@
 LSYM(Lad_d):
 	teq	r2, #0
 	eor	r1, r1, #0x00800000
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq, te
 	eoreq	r0, r0, #0x00800000
 	addeq	r2, r2, #1
 	subne	r3, r3, #1
@@ -214,7 +249,10 @@
 	mov	r3, r1, lsl #1
 
 	mvns	ip, r2, asr #24
-	mvnnes	ip, r3, asr #24
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne
+	COND(mvn,s,ne)	ip, r3, asr #24
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	beq	LSYM(Lad_i)
 
 	teq	r2, r3
@@ -222,12 +260,16 @@
 
 	@ Result is x + 0.0 = x or 0.0 + y = y.
 	teq	r2, #0
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	moveq	r0, r1
 	RET
 
 1:	teq	r0, r1
 
 	@ Result is x - x = 0.
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ne, t
 	movne	r0, #0
 	RETc(ne)
 
@@ -235,9 +277,13 @@
 	tst	r2, #0xff000000
 	bne	2f
 	movs	r0, r0, lsl #1
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	cs
 	orrcs	r0, r0, #0x80000000
 	RET
 2:	adds	r2, r2, #(2 << 24)
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	cc, t
 	addcc	r0, r0, #(1 << 23)
 	RETc(cc)
 	and	r3, r0, #0x80000000
@@ -256,11 +302,15 @@
 	@   otherwise return r0 (which is INF or -INF)
 LSYM(Lad_i):
 	mvns	r2, r2, asr #24
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne, et
 	movne	r0, r1
-	mvneqs	r3, r3, asr #24
+	COND(mvn,s,eq)	r3, r3, asr #24
 	movne	r1, r0
 	movs	r2, r0, lsl #9
-	moveqs	r3, r1, lsl #9
+	do_it	eq, te
+	COND(mov,s,eq)	r3, r1, lsl #9
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	teqeq	r0, r1
 	orrne	r0, r0, #0x00400000	@ quiet NAN
 	RET
@@ -283,9 +333,13 @@
 ARM_FUNC_ALIAS(aeabi_i2f,floatsisf)
 	
 	ands	r3, r0, #0x80000000
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	mi
 	rsbmi	r0, r0, #0
 
 1:	movs	ip, r0
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	RETc(eq)
 
 	@ Add initial exponent to sign
@@ -310,7 +364,12 @@
 
 	orrs	r2, r0, r1
 #if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	eq, t
 	mvfeqs	f0, #0.0
+#else
+	do_it	eq
+/* APPLE LOCAL end v7 support. Merge from mainline */
 #endif
 	RETc(eq)
 
@@ -323,14 +382,26 @@
 
 	orrs	r2, r0, r1
 #if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	eq, t
 	mvfeqs	f0, #0.0
+#else
+	do_it	eq
+/* APPLE LOCAL end v7 support. Merge from mainline */
 #endif
 	RETc(eq)
 
 	ands	r3, ah, #0x80000000	@ sign bit in r3
 	bpl	1f
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+	negs	al, al
+	sbc	ah, ah, ah, lsl #1
+#else
 	rsbs	al, al, #0
 	rsc	ah, ah, #0
+#endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
 1:
 #if !defined (__VFP_FP__) && !defined(__SOFTFP__)
 	@ For hard FPA code we want to return via the tail below so that
@@ -341,12 +412,16 @@
 #endif
 
 	movs	ip, ah
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq, tt
 	moveq	ip, al
 	moveq	ah, al
 	moveq	al, #0
 
 	@ Add initial exponent to sign
 	orr	r3, r3, #((127 + 23 + 32) << 23)
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	subeq	r3, r3, #(32 << 23)
 2:	sub	r3, r3, #(1 << 23)
 
@@ -354,15 +429,23 @@
 
 	mov	r2, #23
 	cmp	ip, #(1 << 16)
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	hs, t
 	movhs	ip, ip, lsr #16
 	subhs	r2, r2, #16
 	cmp	ip, #(1 << 8)
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	hs, t
 	movhs	ip, ip, lsr #8
 	subhs	r2, r2, #8
 	cmp	ip, #(1 << 4)
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	hs, t
 	movhs	ip, ip, lsr #4
 	subhs	r2, r2, #4
 	cmp	ip, #(1 << 2)
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	hs, e
 	subhs	r2, r2, #2
 	sublo	r2, r2, ip, lsr #1
 	subs	r2, r2, ip, lsr #3
@@ -377,19 +460,23 @@
 	sub	r3, r3, r2, lsl #23
 	blt	3f
 
-	add	r3, r3, ah, lsl r2
-	mov	ip, al, lsl r2
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	shiftop add, r3, r3, ah, lsl, r2, ip
+	shift1	lsl, ip, al, r2
 	rsb	r2, r2, #32
 	cmp	ip, #0x80000000
-	adc	r0, r3, al, lsr r2
+	shiftop adc, r0, r3, al, lsr, r2, r2
+	do_it	eq
 	biceq	r0, r0, #1
 	RET
 
 3:	add	r2, r2, #32
-	mov	ip, ah, lsl r2
+	shift1	lsl, ip, ah, r2
 	rsb	r2, r2, #32
 	orrs	al, al, ip, lsl #1
-	adc	r0, r3, ah, lsr r2
+	shiftop adc, r0, r3, ah, lsr, r2, r2
+	do_it	eq
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	biceq	r0, r0, ip, lsr #31
 	RET
 
@@ -418,7 +505,10 @@
 	@ Mask out exponents, trap any zero/denormal/INF/NAN.
 	mov	ip, #0xff
 	ands	r2, ip, r0, lsr #23
-	andnes	r3, ip, r1, lsr #23
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne, tt
+	COND(and,s,ne)	r3, ip, r1, lsr #23
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	teqne	r2, ip
 	teqne	r3, ip
 	beq	LSYM(Lml_s)
@@ -434,7 +524,10 @@
 	@ If power of two, branch to a separate path.
 	@ Make up for final alignment.
 	movs	r0, r0, lsl #9
-	movnes	r1, r1, lsl #9
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne
+	COND(mov,s,ne)	r1, r1, lsl #9
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	beq	LSYM(Lml_1)
 	mov	r3, #0x08000000
 	orr	r0, r3, r0, lsr #5
@@ -446,7 +539,8 @@
 	and	r3, ip, #0x80000000
 
 	@ Well, no way to make it shorter without the umull instruction.
-	stmfd	sp!, {r3, r4, r5}
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_push	{r3, r4, r5}
 	mov	r4, r0, lsr #16
 	mov	r5, r1, lsr #16
 	bic	r0, r0, r4, lsl #16
@@ -457,7 +551,8 @@
 	mla	r0, r4, r1, r0
 	adds	r3, r3, r0, lsl #16
 	adc	r1, ip, r0, lsr #16
-	ldmfd	sp!, {r0, r4, r5}
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_pop	{r0, r4, r5}
 
 #else
 
@@ -471,6 +566,8 @@
 
 	@ Adjust result upon the MSB position.
 	cmp	r1, #(1 << 23)
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	cc, tt
 	movcc	r1, r1, lsl #1
 	orrcc	r1, r1, r3, lsr #31
 	movcc	r3, r3, lsl #1
@@ -486,6 +583,8 @@
 	@ Round the result, merge final exponent.
 	cmp	r3, #0x80000000
 	adc	r0, r0, r2, lsl #23
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	biceq	r0, r0, #1
 	RET
 
@@ -493,11 +592,15 @@
 LSYM(Lml_1):
 	teq	r0, #0
 	and	ip, ip, #0x80000000
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	eq
 	moveq	r1, r1, lsl #9
 	orr	r0, ip, r0, lsr #9
 	orr	r0, r0, r1, lsr #9
 	subs	r2, r2, #127
-	rsbgts	r3, r2, #255
+	do_it	gt, tt
+	COND(rsb,s,gt)	r3, r2, #255
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	orrgt	r0, r0, r2, lsl #23
 	RETc(gt)
 
@@ -512,18 +615,22 @@
 
 	@ Check if denormalized result is possible, otherwise return signed 0.
 	cmn	r2, #(24 + 1)
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	le, t
 	bicle	r0, r0, #0x7fffffff
 	RETc(le)
 
 	@ Shift value right, round, etc.
 	rsb	r2, r2, #0
 	movs	r1, r0, lsl #1
-	mov	r1, r1, lsr r2
+	shift1	lsr, r1, r1, r2
 	rsb	r2, r2, #32
-	mov	ip, r0, lsl r2
+	shift1	lsl, ip, r0, r2
 	movs	r0, r1, rrx
 	adc	r0, r0, #0
 	orrs	r3, r3, ip, lsl #1
+	do_it	eq
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	biceq	r0, r0, ip, lsr #31
 	RET
 
@@ -532,14 +639,18 @@
 LSYM(Lml_d):
 	teq	r2, #0
 	and	ip, r0, #0x80000000
-1:	moveq	r0, r0, lsl #1
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+1:	do_it	eq, tt
+	moveq	r0, r0, lsl #1
 	tsteq	r0, #0x00800000
 	subeq	r2, r2, #1
 	beq	1b
 	orr	r0, r0, ip
 	teq	r3, #0
 	and	ip, r1, #0x80000000
-2:	moveq	r1, r1, lsl #1
+2:	do_it	eq, tt
+	moveq	r1, r1, lsl #1
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	tsteq	r1, #0x00800000
 	subeq	r3, r3, #1
 	beq	2b
@@ -550,12 +661,16 @@
 	@ Isolate the INF and NAN cases away
 	and	r3, ip, r1, lsr #23
 	teq	r2, ip
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne
 	teqne	r3, ip
 	beq	1f
 
 	@ Here, one or more arguments are either denormalized or zero.
 	bics	ip, r0, #0x80000000
-	bicnes	ip, r1, #0x80000000
+	do_it	ne
+	COND(bic,s,ne)	ip, r1, #0x80000000
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	bne	LSYM(Lml_d)
 
 	@ Result is 0, but determine sign anyway.
@@ -566,6 +681,8 @@
 
 1:	@ One or both args are INF or NAN.
 	teq	r0, #0x0
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ne, ett
 	teqne	r0, #0x80000000
 	moveq	r0, r1
 	teqne	r1, #0x0
@@ -578,6 +695,8 @@
 1:	teq	r3, ip
 	bne	LSYM(Lml_i)
 	movs	r3, r1, lsl #9
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ne
 	movne	r0, r1
 	bne	LSYM(Lml_n)		@ <anything> * NAN -> NAN
 
@@ -608,7 +727,10 @@
 	@ Mask out exponents, trap any zero/denormal/INF/NAN.
 	mov	ip, #0xff
 	ands	r2, ip, r0, lsr #23
-	andnes	r3, ip, r1, lsr #23
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne, tt
+	COND(and,s,ne)	r3, ip, r1, lsr #23
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	teqne	r2, ip
 	teqne	r3, ip
 	beq	LSYM(Ldv_s)
@@ -635,25 +757,33 @@
 	@ Ensure result will land to known bit position.
 	@ Apply exponent bias accordingly.
 	cmp	r3, r1
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	cc
 	movcc	r3, r3, lsl #1
 	adc	r2, r2, #(127 - 2)
 
 	@ The actual division loop.
 	mov	ip, #0x00800000
 1:	cmp	r3, r1
+	do_it	cs, t
 	subcs	r3, r3, r1
 	orrcs	r0, r0, ip
 	cmp	r3, r1, lsr #1
+	do_it	cs, t
 	subcs	r3, r3, r1, lsr #1
 	orrcs	r0, r0, ip, lsr #1
 	cmp	r3, r1, lsr #2
+	do_it	cs, t
 	subcs	r3, r3, r1, lsr #2
 	orrcs	r0, r0, ip, lsr #2
 	cmp	r3, r1, lsr #3
+	do_it	cs, t
 	subcs	r3, r3, r1, lsr #3
 	orrcs	r0, r0, ip, lsr #3
 	movs	r3, r3, lsl #4
-	movnes	ip, ip, lsr #4
+	do_it	ne
+	COND(mov,s,ne)	ip, ip, lsr #4
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	bne	1b
 
 	@ Check exponent for under/overflow.
@@ -663,6 +793,8 @@
 	@ Round the result, merge final exponent.
 	cmp	r3, r1
 	adc	r0, r0, r2, lsl #23
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq
 	biceq	r0, r0, #1
 	RET
 
@@ -671,7 +803,10 @@
 	and	ip, ip, #0x80000000
 	orr	r0, ip, r0, lsr #9
 	adds	r2, r2, #127
-	rsbgts	r3, r2, #255
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	gt, tt
+	COND(rsb,s,gt)	r3, r2, #255
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	orrgt	r0, r0, r2, lsl #23
 	RETc(gt)
 
@@ -685,14 +820,18 @@
 LSYM(Ldv_d):
 	teq	r2, #0
 	and	ip, r0, #0x80000000
-1:	moveq	r0, r0, lsl #1
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+1:	do_it	eq, tt
+	moveq	r0, r0, lsl #1
 	tsteq	r0, #0x00800000
 	subeq	r2, r2, #1
 	beq	1b
 	orr	r0, r0, ip
 	teq	r3, #0
 	and	ip, r1, #0x80000000
-2:	moveq	r1, r1, lsl #1
+2:	do_it	eq, tt
+	moveq	r1, r1, lsl #1
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	tsteq	r1, #0x00800000
 	subeq	r3, r3, #1
 	beq	2b
@@ -718,7 +857,10 @@
 	b	LSYM(Lml_n)		@ <anything> / NAN -> NAN
 2:	@ If both are nonzero, we need to normalize and resume above.
 	bics	ip, r0, #0x80000000
-	bicnes	ip, r1, #0x80000000
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne
+	COND(bic,s,ne)	ip, r1, #0x80000000
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	bne	LSYM(Ldv_d)
 	@ One or both arguments are zero.
 	bics	r2, r0, #0x80000000
@@ -774,18 +916,26 @@
 	mov	r2, r0, lsl #1
 	mov	r3, r1, lsl #1
 	mvns	ip, r2, asr #24
-	mvnnes	ip, r3, asr #24
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	do_it	ne
+	COND(mvn,s,ne)	ip, r3, asr #24
 	beq	3f
 
 	@ Compare values.
 	@ Note that 0.0 is equal to -0.0.
 2:	orrs	ip, r2, r3, lsr #1	@ test if both are 0, clear C flag
+	do_it	ne
 	teqne	r0, r1			@ if not 0 compare sign
-	subpls	r0, r2, r3		@ if same sign compare values, set r0
+	do_it	pl
+	COND(sub,s,pl)	r0, r2, r3		@ if same sign compare values, set r0
 
 	@ Result:
+	do_it	hi
 	movhi	r0, r1, asr #31
+	do_it	lo
 	mvnlo	r0, r1, asr #31
+	do_it	ne
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	orrne	r0, r0, #1
 	RET
 
@@ -822,13 +972,16 @@
 
 	@ The status-returning routines are required to preserve all
 	@ registers except ip, lr, and cpsr.
-6:	stmfd	sp!, {r0, r1, r2, r3, lr}
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+6:  do_push {r0, r1, r2, r3, lr}
 	ARM_CALL cmpsf2
 	@ Set the Z flag correctly, and the C flag unconditionally.
-	cmp	 r0, #0
+	cmp r0, #0
 	@ Clear the C flag if the return value was -1, indicating
 	@ that the first operand was smaller than the second.
-	cmnmi	 r0, #0
+	do_it mi
+	cmnmi r0, #0
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	/* APPLE LOCAL ARM MACH assembler */
 	RETLDM1(r0, r1, r2, r3)
 
@@ -840,6 +993,8 @@
 
 	str	lr, [sp, #-8]!
 	ARM_CALL aeabi_cfcmple
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	eq, e
 	moveq	r0, #1	@ Equal to.
 	movne	r0, #0	@ Less than, greater than, or unordered.
 	RETLDM
@@ -850,6 +1005,8 @@
 
 	str	lr, [sp, #-8]!
 	ARM_CALL aeabi_cfcmple
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	cc, e
 	movcc	r0, #1	@ Less than.
 	movcs	r0, #0	@ Equal to, greater than, or unordered.
 	RETLDM
@@ -860,6 +1017,8 @@
 
 	str	lr, [sp, #-8]!
 	ARM_CALL aeabi_cfcmple
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ls, e
 	movls	r0, #1  @ Less than or equal to.
 	movhi	r0, #0	@ Greater than or unordered.
 	RETLDM
@@ -870,6 +1029,8 @@
 
 	str	lr, [sp, #-8]!
 	ARM_CALL aeabi_cfrcmple
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	ls, e
 	movls	r0, #1	@ Operand 2 is less than or equal to operand 1.
 	movhi	r0, #0	@ Operand 2 greater than operand 1, or unordered.
 	RETLDM
@@ -880,6 +1041,8 @@
 
 	str	lr, [sp, #-8]!
 	ARM_CALL aeabi_cfrcmple
+/* APPLE LOCAL v7 support. Merge from mainline */
+	do_it	cc, e
 	movcc	r0, #1	@ Operand 2 is less than operand 1.
 	movcs	r0, #0  @ Operand 2 is greater than or equal to operand 1,
 			@ or they are unordered.
@@ -933,7 +1096,9 @@
 	mov	r3, r0, lsl #8
 	orr	r3, r3, #0x80000000
 	tst	r0, #0x80000000		@ the sign bit
-	mov	r0, r3, lsr r2
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+	shift1	lsr, r0, r3, r2
+	do_it	ne
 	rsbne	r0, r0, #0
 	RET
 
@@ -945,6 +1110,8 @@
 	movs	r2, r0, lsl #9
 	bne	4f			@ r0 is NAN.
 3:	ands	r0, r0, #0x80000000	@ the sign bit
+	do_it	eq
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	moveq	r0, #0x7fffffff		@ the maximum signed positive si
 	RET
 
@@ -974,7 +1141,8 @@
 	@ scale the value
 	mov	r3, r0, lsl #8
 	orr	r3, r3, #0x80000000
-	mov	r0, r3, lsr r2
+/* APPLE LOCAL v7 support. Merge from mainline */
+	shift1	lsr, r0, r3, r2
 	RET
 
 1:	mov	r0, #0
@@ -1062,6 +1230,7 @@
 	fmsr	s15, r1
 	fcmps	s14, s15
 	fmstat
+	do_it	ne, e
 	movne	r0, #0
 	moveq	r0, #1
 	RET
@@ -1078,6 +1247,7 @@
 	fmsr	s15, r1
 	fcmps	s14, s15
 	fmstat
+	do_it	eq, e
 	moveq	r0, #0
 	movne	r0, #1
 	RET
@@ -1094,6 +1264,7 @@
 	fmsr	s15, r1
 	fcmps	s14, s15
 	fmstat
+	do_it	pl, e
 	movpl	r0, #0
 	movmi	r0, #1
 	RET
@@ -1110,6 +1281,7 @@
 	fmsr	s15, r1
 	fcmps	s14, s15
 	fmstat
+	do_it	le, e
 	movle	r0, #0
 	movgt	r0, #1
 	RET
@@ -1126,6 +1298,7 @@
 	fmsr	s15, r1
 	fcmps	s14, s15
 	fmstat
+	do_it	hi, e
 	movhi	r0, #0
 	movls	r0, #1
 	RET
@@ -1142,6 +1315,7 @@
 	fmsr	s15, r1
 	fcmps	s14, s15
 	fmstat
+	do_it	lt, e
 	movlt	r0, #0
 	movge	r0, #1
 	RET
@@ -1158,6 +1332,7 @@
 	fmsr	s15, r1
 	fcmps	s14, s15
 	fmstat
+	do_it	vc, e
 	movvc	r0, #0
 	movvs	r0, #1
 	RET
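
One recurring non-mechanical change in both ieee754-df.S and ieee754-sf.S is the 64-bit negation: Thumb-2 has no RSC instruction, so the rsbs/rsc pair cannot negate a high:low register pair directly.  The replacement leans on the carry flag left by NEGS, which is set exactly when the low word was zero, so a single SBC with the high word doubled as the subtrahend reproduces the borrow.  A worked sketch, using the xl/xh low/high naming of these files:

@ ARM mode: reverse subtract with carry negates xh:xl directly.
	rsbs	xl, xl, #0		@ xl := 0 - xl,  C := 1 iff xl was 0
	rsc	xh, xh, #0		@ xh := 0 - xh - (1 - C)

@ Thumb-2 (no RSC): build the same result from negs + sbc.
	negs	xl, xl			@ xl := -xl,  C := 1 iff xl was 0
	sbc	xh, xh, xh, lsl #1	@ xh := xh - 2*xh - (1-C) = -xh - (1-C)

When xl was zero there is no borrow and xh simply becomes -xh; otherwise the extra -1 accounts for the borrow out of the low word, which is exactly what rsc produced in ARM mode.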

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/iwmmxt.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/iwmmxt.md?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/iwmmxt.md (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/iwmmxt.md Wed Jul 22 15:36:27 2009
@@ -1,3 +1,5 @@
+;; APPLE LOCAL v7 support. Merge from mainline
+;; ??? This file needs auditing for thumb2
 ;; Patterns for the Intel Wireless MMX technology architecture.
 ;; Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.
 ;; Contributed by Red Hat.
@@ -19,6 +21,17 @@
 ;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
 ;; Boston, MA 02110-1301, USA.
 
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+;; Integer element sizes implemented by IWMMXT.
+(define_mode_macro VMMX [V2SI V4HI V8QI])
+
+;; Integer element sizes for shifts.
+(define_mode_macro VSHFT [V4HI V2SI DI])
+
+;; Determine element size suffix from vector mode.
+(define_mode_attr MMX_char [(V8QI "b") (V4HI "h") (V2SI "w") (DI "d")])
+
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
 (define_insn "iwmmxt_iordi3"
   [(set (match_operand:DI         0 "register_operand" "=y,?&r,?&r")
         (ior:DI (match_operand:DI 1 "register_operand" "%y,0,r")
@@ -157,9 +170,10 @@
    (set_attr "neg_pool_range" "*,*,4084,     *,*,*")]
 )
 
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
 (define_insn "movv8qi_internal"
-  [(set (match_operand:V8QI 0 "nonimmediate_operand" "=y,m,y,?r,?y,?r")
-	(match_operand:V8QI 1 "general_operand"       "y,y,mi,y,r,mi"))]
+  [(set (match_operand:V8QI 0 "nonimmediate_operand" "=y,m,y,?r,?y,?r,?r")
+	(match_operand:V8QI 1 "general_operand"       "y,y,mi,y,r,r,mi"))]
   "TARGET_REALLY_IWMMXT"
   "*
    switch (which_alternative)
@@ -169,17 +183,18 @@
    case 2: return \"wldrd%?\\t%0, %1\";
    case 3: return \"tmrrc%?\\t%Q0, %R0, %1\";
    case 4: return \"tmcrr%?\\t%0, %Q1, %R1\";
+   case 5: return \"#\";
    default: return output_move_double (operands);
    }"
   [(set_attr "predicable" "yes")
-   (set_attr "length"         "4,     4,   4,4,4,   8")
-   (set_attr "type"           "*,store1,load1,*,*,load1")
-   (set_attr "pool_range"     "*,     *, 256,*,*, 256")
-   (set_attr "neg_pool_range" "*,     *, 244,*,*, 244")])
+   (set_attr "length"         "4,     4,   4,4,4,8,   8")
+   (set_attr "type"           "*,store1,load1,*,*,*,load1")
+   (set_attr "pool_range"     "*,     *, 256,*,*,*, 256")
+   (set_attr "neg_pool_range" "*,     *, 244,*,*,*, 244")])
 
 (define_insn "movv4hi_internal"
-  [(set (match_operand:V4HI 0 "nonimmediate_operand" "=y,m,y,?r,?y,?r")
-	(match_operand:V4HI 1 "general_operand"       "y,y,mi,y,r,mi"))]
+  [(set (match_operand:V4HI 0 "nonimmediate_operand" "=y,m,y,?r,?y,?r,?r")
+	(match_operand:V4HI 1 "general_operand"       "y,y,mi,y,r,r,mi"))]
   "TARGET_REALLY_IWMMXT"
   "*
    switch (which_alternative)
@@ -189,17 +204,18 @@
    case 2: return \"wldrd%?\\t%0, %1\";
    case 3: return \"tmrrc%?\\t%Q0, %R0, %1\";
    case 4: return \"tmcrr%?\\t%0, %Q1, %R1\";
+   case 5: return \"#\";
    default: return output_move_double (operands);
    }"
   [(set_attr "predicable" "yes")
-   (set_attr "length"         "4,     4,   4,4,4,   8")
-   (set_attr "type"           "*,store1,load1,*,*,load1")
-   (set_attr "pool_range"     "*,     *, 256,*,*, 256")
-   (set_attr "neg_pool_range" "*,     *, 244,*,*, 244")])
+   (set_attr "length"         "4,     4,   4,4,4,8,   8")
+   (set_attr "type"           "*,store1,load1,*,*,*,load1")
+   (set_attr "pool_range"     "*,     *, 256,*,*,*, 256")
+   (set_attr "neg_pool_range" "*,     *, 244,*,*,*, 244")])
 
 (define_insn "movv2si_internal"
-  [(set (match_operand:V2SI 0 "nonimmediate_operand" "=y,m,y,?r,?y,?r")
-	(match_operand:V2SI 1 "general_operand"       "y,y,mi,y,r,mi"))]
+  [(set (match_operand:V2SI 0 "nonimmediate_operand" "=y,m,y,?r,?y,?r,?r")
+	(match_operand:V2SI 1 "general_operand"       "y,y,mi,y,r,r,mi"))]
   "TARGET_REALLY_IWMMXT"
   "*
    switch (which_alternative)
@@ -209,13 +225,15 @@
    case 2: return \"wldrd%?\\t%0, %1\";
    case 3: return \"tmrrc%?\\t%Q0, %R0, %1\";
    case 4: return \"tmcrr%?\\t%0, %Q1, %R1\";
+   case 5: return \"#\";
    default: return output_move_double (operands);
    }"
   [(set_attr "predicable" "yes")
-   (set_attr "length"         "4,     4,   4,4,4,  24")
-   (set_attr "type"           "*,store1,load1,*,*,load1")
-   (set_attr "pool_range"     "*,     *, 256,*,*, 256")
-   (set_attr "neg_pool_range" "*,     *, 244,*,*, 244")])
+   (set_attr "length"         "4,     4,   4,4,4,8,  24")
+   (set_attr "type"           "*,store1,load1,*,*,*,load1")
+   (set_attr "pool_range"     "*,     *, 256,*,*,*, 256")
+   (set_attr "neg_pool_range" "*,     *, 244,*,*,*, 244")])
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
 
 ;; This pattern should not be needed.  It is to match a
 ;; wierd case generated by GCC when no optimizations are
@@ -235,30 +253,16 @@
 
 ;; Vector add/subtract
 
-(define_insn "addv8qi3"
-  [(set (match_operand:V8QI            0 "register_operand" "=y")
-        (plus:V8QI (match_operand:V8QI 1 "register_operand"  "y")
-	           (match_operand:V8QI 2 "register_operand"  "y")))]
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+(define_insn "*add<mode>3_iwmmxt"
+  [(set (match_operand:VMMX            0 "register_operand" "=y")
+        (plus:VMMX (match_operand:VMMX 1 "register_operand"  "y")
+	           (match_operand:VMMX 2 "register_operand"  "y")))]
   "TARGET_REALLY_IWMMXT"
-  "waddb%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "addv4hi3"
-  [(set (match_operand:V4HI            0 "register_operand" "=y")
-        (plus:V4HI (match_operand:V4HI 1 "register_operand"  "y")
-	           (match_operand:V4HI 2 "register_operand"  "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "waddh%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "addv2si3"
-  [(set (match_operand:V2SI            0 "register_operand" "=y")
-        (plus:V2SI (match_operand:V2SI 1 "register_operand"  "y")
-	           (match_operand:V2SI 2 "register_operand"  "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "waddw%?\\t%0, %1, %2"
+  "wadd<MMX_char>%?\\t%0, %1, %2"
   [(set_attr "predicable" "yes")])
 
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
 (define_insn "ssaddv8qi3"
   [(set (match_operand:V8QI               0 "register_operand" "=y")
         (ss_plus:V8QI (match_operand:V8QI 1 "register_operand"  "y")
@@ -307,30 +311,16 @@
   "waddwus%?\\t%0, %1, %2"
   [(set_attr "predicable" "yes")])
 
-(define_insn "subv8qi3"
-  [(set (match_operand:V8QI             0 "register_operand" "=y")
-        (minus:V8QI (match_operand:V8QI 1 "register_operand"  "y")
-		    (match_operand:V8QI 2 "register_operand"  "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "wsubb%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "subv4hi3"
-  [(set (match_operand:V4HI             0 "register_operand" "=y")
-        (minus:V4HI (match_operand:V4HI 1 "register_operand"  "y")
-		    (match_operand:V4HI 2 "register_operand"  "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "wsubh%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "subv2si3"
-  [(set (match_operand:V2SI             0 "register_operand" "=y")
-        (minus:V2SI (match_operand:V2SI 1 "register_operand"  "y")
-		    (match_operand:V2SI 2 "register_operand"  "y")))]
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+(define_insn "*sub<mode>3_iwmmxt"
+  [(set (match_operand:VMMX             0 "register_operand" "=y")
+        (minus:VMMX (match_operand:VMMX 1 "register_operand"  "y")
+		    (match_operand:VMMX 2 "register_operand"  "y")))]
   "TARGET_REALLY_IWMMXT"
-  "wsubw%?\\t%0, %1, %2"
+  "wsub<MMX_char>%?\\t%0, %1, %2"
   [(set_attr "predicable" "yes")])
 
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
 (define_insn "sssubv8qi3"
   [(set (match_operand:V8QI                0 "register_operand" "=y")
         (ss_minus:V8QI (match_operand:V8QI 1 "register_operand"  "y")
@@ -379,7 +369,8 @@
   "wsubwus%?\\t%0, %1, %2"
   [(set_attr "predicable" "yes")])
 
-(define_insn "mulv4hi3"
+;; APPLE LOCAL v7 support. Merge from Codesourcery
+(define_insn "*mulv4hi3_iwmmxt"
   [(set (match_operand:V4HI            0 "register_operand" "=y")
         (mult:V4HI (match_operand:V4HI 1 "register_operand" "y")
 		   (match_operand:V4HI 2 "register_operand" "y")))]
@@ -730,102 +721,40 @@
 
 ;; Max/min insns
 
-(define_insn "smaxv8qi3"
-  [(set (match_operand:V8QI            0 "register_operand" "=y")
-        (smax:V8QI (match_operand:V8QI 1 "register_operand" "y")
-		   (match_operand:V8QI 2 "register_operand" "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "wmaxsb%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "umaxv8qi3"
-  [(set (match_operand:V8QI            0 "register_operand" "=y")
-        (umax:V8QI (match_operand:V8QI 1 "register_operand" "y")
-		   (match_operand:V8QI 2 "register_operand" "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "wmaxub%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "smaxv4hi3"
-  [(set (match_operand:V4HI            0 "register_operand" "=y")
-        (smax:V4HI (match_operand:V4HI 1 "register_operand" "y")
-		   (match_operand:V4HI 2 "register_operand" "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "wmaxsh%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "umaxv4hi3"
-  [(set (match_operand:V4HI            0 "register_operand" "=y")
-        (umax:V4HI (match_operand:V4HI 1 "register_operand" "y")
-		   (match_operand:V4HI 2 "register_operand" "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "wmaxuh%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "smaxv2si3"
-  [(set (match_operand:V2SI            0 "register_operand" "=y")
-        (smax:V2SI (match_operand:V2SI 1 "register_operand" "y")
-		   (match_operand:V2SI 2 "register_operand" "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "wmaxsw%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "umaxv2si3"
-  [(set (match_operand:V2SI            0 "register_operand" "=y")
-        (umax:V2SI (match_operand:V2SI 1 "register_operand" "y")
-		   (match_operand:V2SI 2 "register_operand" "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "wmaxuw%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "sminv8qi3"
-  [(set (match_operand:V8QI            0 "register_operand" "=y")
-        (smin:V8QI (match_operand:V8QI 1 "register_operand" "y")
-		   (match_operand:V8QI 2 "register_operand" "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "wminsb%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "uminv8qi3"
-  [(set (match_operand:V8QI            0 "register_operand" "=y")
-        (umin:V8QI (match_operand:V8QI 1 "register_operand" "y")
-		   (match_operand:V8QI 2 "register_operand" "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "wminub%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "sminv4hi3"
-  [(set (match_operand:V4HI            0 "register_operand" "=y")
-        (smin:V4HI (match_operand:V4HI 1 "register_operand" "y")
-		   (match_operand:V4HI 2 "register_operand" "y")))]
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+(define_insn "*smax<mode>3_iwmmxt"
+  [(set (match_operand:VMMX            0 "register_operand" "=y")
+        (smax:VMMX (match_operand:VMMX 1 "register_operand" "y")
+		   (match_operand:VMMX 2 "register_operand" "y")))]
   "TARGET_REALLY_IWMMXT"
-  "wminsh%?\\t%0, %1, %2"
+  "wmaxs<MMX_char>%?\\t%0, %1, %2"
   [(set_attr "predicable" "yes")])
 
-(define_insn "uminv4hi3"
-  [(set (match_operand:V4HI            0 "register_operand" "=y")
-        (umin:V4HI (match_operand:V4HI 1 "register_operand" "y")
-		   (match_operand:V4HI 2 "register_operand" "y")))]
+(define_insn "*umax<mode>3_iwmmxt"
+  [(set (match_operand:VMMX            0 "register_operand" "=y")
+        (umax:VMMX (match_operand:VMMX 1 "register_operand" "y")
+		   (match_operand:VMMX 2 "register_operand" "y")))]
   "TARGET_REALLY_IWMMXT"
-  "wminuh%?\\t%0, %1, %2"
+  "wmaxu<MMX_char>%?\\t%0, %1, %2"
   [(set_attr "predicable" "yes")])
 
-(define_insn "sminv2si3"
-  [(set (match_operand:V2SI            0 "register_operand" "=y")
-        (smin:V2SI (match_operand:V2SI 1 "register_operand" "y")
-		   (match_operand:V2SI 2 "register_operand" "y")))]
+(define_insn "*smin<mode>3_iwmmxt"
+  [(set (match_operand:VMMX            0 "register_operand" "=y")
+        (smin:VMMX (match_operand:VMMX 1 "register_operand" "y")
+		   (match_operand:VMMX 2 "register_operand" "y")))]
   "TARGET_REALLY_IWMMXT"
-  "wminsw%?\\t%0, %1, %2"
+  "wmins<MMX_char>%?\\t%0, %1, %2"
   [(set_attr "predicable" "yes")])
 
-(define_insn "uminv2si3"
-  [(set (match_operand:V2SI            0 "register_operand" "=y")
-        (umin:V2SI (match_operand:V2SI 1 "register_operand" "y")
-		   (match_operand:V2SI 2 "register_operand" "y")))]
+(define_insn "*umin<mode>3_iwmmxt"
+  [(set (match_operand:VMMX            0 "register_operand" "=y")
+        (umin:VMMX (match_operand:VMMX 1 "register_operand" "y")
+		   (match_operand:VMMX 2 "register_operand" "y")))]
   "TARGET_REALLY_IWMMXT"
-  "wminuw%?\\t%0, %1, %2"
+  "wminu<MMX_char>%?\\t%0, %1, %2"
   [(set_attr "predicable" "yes")])
 
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
 ;; Pack/unpack insns.
 
 (define_insn "iwmmxt_wpackhss"
@@ -1137,77 +1066,31 @@
   "wrordg%?\\t%0, %1, %2"
   [(set_attr "predicable" "yes")])
 
-(define_insn "ashrv4hi3"
-  [(set (match_operand:V4HI                0 "register_operand" "=y")
-        (ashiftrt:V4HI (match_operand:V4HI 1 "register_operand" "y")
-		       (match_operand:SI   2 "register_operand" "z")))]
-  "TARGET_REALLY_IWMMXT"
-  "wsrahg%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "ashrv2si3"
-  [(set (match_operand:V2SI                0 "register_operand" "=y")
-        (ashiftrt:V2SI (match_operand:V2SI 1 "register_operand" "y")
-		       (match_operand:SI   2 "register_operand" "z")))]
-  "TARGET_REALLY_IWMMXT"
-  "wsrawg%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "ashrdi3_iwmmxt"
-  [(set (match_operand:DI              0 "register_operand" "=y")
-	(ashiftrt:DI (match_operand:DI 1 "register_operand" "y")
-		   (match_operand:SI   2 "register_operand" "z")))]
-  "TARGET_REALLY_IWMMXT"
-  "wsradg%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "lshrv4hi3"
-  [(set (match_operand:V4HI                0 "register_operand" "=y")
-        (lshiftrt:V4HI (match_operand:V4HI 1 "register_operand" "y")
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+(define_insn "ashr<mode>3_iwmmxt"
+  [(set (match_operand:VSHFT                 0 "register_operand" "=y")
+        (ashiftrt:VSHFT (match_operand:VSHFT 1 "register_operand" "y")
 		       (match_operand:SI   2 "register_operand" "z")))]
   "TARGET_REALLY_IWMMXT"
-  "wsrlhg%?\\t%0, %1, %2"
+  "wsra<MMX_char>g%?\\t%0, %1, %2"
   [(set_attr "predicable" "yes")])
 
-(define_insn "lshrv2si3"
-  [(set (match_operand:V2SI                0 "register_operand" "=y")
-        (lshiftrt:V2SI (match_operand:V2SI 1 "register_operand" "y")
-		       (match_operand:SI   2 "register_operand" "z")))]
-  "TARGET_REALLY_IWMMXT"
-  "wsrlwg%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "lshrdi3_iwmmxt"
-  [(set (match_operand:DI              0 "register_operand" "=y")
-	(lshiftrt:DI (match_operand:DI 1 "register_operand" "y")
+(define_insn "lshr<mode>3_iwmmxt"
+  [(set (match_operand:VSHFT                 0 "register_operand" "=y")
+        (lshiftrt:VSHFT (match_operand:VSHFT 1 "register_operand" "y")
 		     (match_operand:SI 2 "register_operand" "z")))]
   "TARGET_REALLY_IWMMXT"
-  "wsrldg%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "ashlv4hi3"
-  [(set (match_operand:V4HI              0 "register_operand" "=y")
-        (ashift:V4HI (match_operand:V4HI 1 "register_operand" "y")
-		     (match_operand:SI   2 "register_operand" "z")))]
-  "TARGET_REALLY_IWMMXT"
-  "wsllhg%?\\t%0, %1, %2"
+  "wsrl<MMX_char>g%?\\t%0, %1, %2"
   [(set_attr "predicable" "yes")])
 
-(define_insn "ashlv2si3"
-  [(set (match_operand:V2SI              0 "register_operand" "=y")
-        (ashift:V2SI (match_operand:V2SI 1 "register_operand" "y")
-		       (match_operand:SI 2 "register_operand" "z")))]
-  "TARGET_REALLY_IWMMXT"
-  "wsllwg%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")])
-
-(define_insn "ashldi3_iwmmxt"
-  [(set (match_operand:DI            0 "register_operand" "=y")
-	(ashift:DI (match_operand:DI 1 "register_operand" "y")
+(define_insn "ashl<mode>3_iwmmxt"
+  [(set (match_operand:VSHFT               0 "register_operand" "=y")
+        (ashift:VSHFT (match_operand:VSHFT 1 "register_operand" "y")
 		   (match_operand:SI 2 "register_operand" "z")))]
   "TARGET_REALLY_IWMMXT"
-  "wslldg%?\\t%0, %1, %2"
+  "wsll<MMX_char>g%?\\t%0, %1, %2"
   [(set_attr "predicable" "yes")])
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
 
 (define_insn "rorv4hi3_di"
   [(set (match_operand:V4HI                0 "register_operand" "=y")

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/lib1funcs.asm
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/lib1funcs.asm?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/lib1funcs.asm (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/lib1funcs.asm Wed Jul 22 15:36:27 2009
@@ -1,7 +1,8 @@
 @ libgcc routines for ARM cpu.
 @ Division routines, written by Richard Earnshaw, (rearnsha at armltd.co.uk)
 
-/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+/* APPLE LOCAL v7 support. Merge from mainline */
+/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005, 2007
    Free Software Foundation, Inc.
 
 This file is free software; you can redistribute it and/or modify it
@@ -62,31 +63,32 @@
 
 /* Function end macros.  Variants for interworking.  */
 
-@ This selects the minimum architecture level required.
-#define __ARM_ARCH__ 3
-
+/* APPLE LOCAL begin v7 support. Merge from mainline */
 #if defined(__ARM_ARCH_3M__) || defined(__ARM_ARCH_4__) \
 	|| defined(__ARM_ARCH_4T__)
 /* We use __ARM_ARCH__ set to 4 here, but in reality it's any processor with
    long multiply instructions.  That includes v3M.  */
-# undef __ARM_ARCH__
 # define __ARM_ARCH__ 4
 #endif
 	
 #if defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) \
 	|| defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
 	|| defined(__ARM_ARCH_5TEJ__)
-# undef __ARM_ARCH__
 # define __ARM_ARCH__ 5
 #endif
 
 #if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
 	|| defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \
-	|| defined(__ARM_ARCH_6ZK__)
-# undef __ARM_ARCH__
+	|| defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__)
 # define __ARM_ARCH__ 6
 #endif
 
+#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+	|| defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)
+# define __ARM_ARCH__ 7
+#endif
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
 #ifndef __ARM_ARCH__
 #error Unable to determine architecture.
 #endif
@@ -184,12 +186,24 @@
 #define RETLDM \
 	ldr     lr, [sp], #8 ; \
 	bx      lr
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined (__thumb2__)
+#define RETLDM1(...) \
+	pop     {__VA_ARGS__, lr} ; \
+	bx      lr
+#define RETLDM2(cond,...) \
+	pop##cond       {__VA_ARGS__, lr} ; \
+	bx##cond        lr
+#else
 #define RETLDM1(...) \
 	ldmia   sp!, {__VA_ARGS__, lr} ; \
 	bx      lr
 #define RETLDM2(cond,...) \
 	ldm##cond##ia   sp!, {__VA_ARGS__, lr} ; \
 	bx##cond        lr
+#endif
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
 #define RETLDM_unwind(addr) \
 	ldr	lr, [sp], #8 ; \
 9:	cfi_pop	9b - addr, 0xe, 0x0 ; \
@@ -197,14 +211,82 @@
 #else
 #define RETLDM \
 	ldr     pc, [sp], #8
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined (__thumb2__)
+#define RETLDM1(...) \
+	pop   {__VA_ARGS__, pc}
+#define RETLDM2(cond,...) \
+	pop##cond   {__VA_ARGS__, pc}
+#else
 #define RETLDM1(...) \
 	ldmia   sp!, {__VA_ARGS__, pc}
 #define RETLDM2(cond,...) \
 	ldm##cond##ia   sp!, {__VA_ARGS__, pc}
+#endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
 #define RETLDM_unwind(addr) \
 	ldr	pc, [sp], #8
 #endif
 
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+
+/* The Unified assembly syntax allows the same code to be assembled for both
+   ARM and Thumb-2.  However this is only supported by recent gas, so define
+   a set of macros to allow ARM code on older assemblers.  */
+#if defined(__thumb2__)
+.macro do_it cond, suffix=""
+#if defined (__MACH__)
+	it$1	$0
+#else
+        it\suffix       \cond
+#endif
+.endm
+.macro shift1 op, arg0, arg1, arg2
+#if defined (__MACH__)
+        $0      $1, $2, $3
+#else
+        \op     \arg0, \arg1, \arg2
+#endif
+.endm   
+#define do_push push
+#define do_pop  pop
+#define COND(op1, op2, cond) op1 ## op2 ## cond
+/* Perform an arithmetic operation with a variable shift operand.  This
+   requires two instructions and a scratch register on Thumb-2.  */
+.macro shiftop name, dest, src1, src2, shiftop, shiftreg, tmp
+#if defined (__MACH__)
+        $4  $6, $3, $5
+        $0  $1, $2, $6
+#else
+        \shiftop \tmp, \src2, \shiftreg
+        \name \dest, \src1, \tmp
+#endif
+.endm   
+#else
+.macro do_it cond, suffix=""
+.endm   
+.macro shift1 op, arg0, arg1, arg2
+#if defined (__MACH__)
+        mov     $1, $2, $0 $3
+#else
+        mov     \arg0, \arg1, \op \arg2
+#endif
+.endm
+#define do_push stmfd sp!,
+#define do_pop  ldmfd sp!,
+#define COND(op1, op2, cond) op1 ## cond ## op2
+.macro shiftop name, dest, src1, src2, shiftop, shiftreg, tmp
+#if defined (__MACH__)
+        $0 $1, $2, $3, $4 $5
+#else
+        \name \dest, \src1, \src2, \shiftop \shiftreg
+#endif
+.endm
+#endif
+
+
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
 .macro ARM_LDIV0 name
 	str	lr, [sp, #-8]!
 #if !defined(__MACH__)
@@ -277,6 +359,13 @@
 #ifdef __thumb__
 #define THUMB_FUNC .thumb_func
 #define THUMB_CODE .force_thumb
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+# if defined(__thumb2__)
+#define THUMB_SYNTAX .syntax divided
+# else
+#define THUMB_SYNTAX
+# endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
 /* APPLE LOCAL ARM function alignment */
 #define FUNC_ALIGN .align 1
 #else
@@ -284,6 +373,8 @@
 #define THUMB_CODE
 /* APPLE LOCAL ARM function alignment */
 #define FUNC_ALIGN .align 2
+/* APPLE LOCAL v7 support. Merge from mainline */
+#define THUMB_SYNTAX
 #endif
 	
 /* APPLE LOCAL begin ARM MACH assembler */
@@ -310,10 +401,33 @@
 /* Special function that will always be coded in ARM assembly, even if
    in Thumb-only compilation.  */
 
-#if defined(__INTERWORKING_STUBS__)
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+
+/* For Thumb-2 we build everything in thumb mode.  */
+.macro ARM_FUNC_START name
+#if defined(__MACH__)
+	FUNC_START $0
+#else
+       FUNC_START \name
+#endif
+       .syntax unified
+.endm
+#define EQUIV .thumb_set
+.macro  ARM_CALL name
+#if defined(__MACH__)
+  bl ___$0
+#else
+  bl  ___\name
+#endif
+.endm
+
+#elif defined(__INTERWORKING_STUBS__)
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
 .macro	ARM_FUNC_START name
 
-#if defined(__MACH)
+#if defined(__MACH__)
 	FUNC_START $0
 #else
 	FUNC_START \name
@@ -339,7 +453,11 @@
 	bl	_L__\name
 #endif
 .endm
-#else
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+
+#else /* !(__INTERWORKING_STUBS__ || __thumb2__) */
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
 .macro	ARM_FUNC_START name
 #if defined(__MACH__)
 	.text
@@ -845,7 +963,7 @@
 	cmp	dividend, divisor
 	blo	LSYM(Lgot_result)
 
-	/* LLVM LOCAL */
+	/* APPLE LOCAL v7 support */
 	THUMB_DIV_MOD_BODY(0)
 	
 	mov	r0, result
@@ -889,7 +1007,8 @@
 #ifdef __thumb__
 	push	{r0, r1, lr}
 	bl	SYM(__udivsi3)
-	POP	{r1, r2, r3}
+	/* APPLE LOCAL v7 support */
+	pop	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
@@ -921,7 +1040,7 @@
 LSYM(Lover10):
 	push	{ work }
 
-	/* LLVM LOCAL */
+	/* APPLE LOCAL v7 support */
 	THUMB_DIV_MOD_BODY(1)
 	
 	pop	{ work }
@@ -975,7 +1094,7 @@
 	cmp	dividend, divisor
 	blo	LSYM(Lgot_result)
 
-	/* LLVM LOCAL */
+	/* APPLE LOCAL v7 support */
 	THUMB_DIV_MOD_BODY(0)
 	
 	mov	r0, result
@@ -1039,7 +1158,8 @@
 #ifdef __thumb__
 	push	{r0, r1, lr}
 	bl	SYM(__divsi3)
-	POP	{r1, r2, r3}
+	/* APPLE LOCAL v7 support */
+	pop	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
@@ -1079,7 +1199,7 @@
 	cmp	dividend, divisor
 	blo	LSYM(Lgot_result)
 
-	/* LLVM LOCAL */
+	/* APPLE LOCAL v7 support */
 	THUMB_DIV_MOD_BODY(1)
 		
 	pop	{ work }
@@ -1279,6 +1399,10 @@
 
 /* APPLE LOCAL begin ARM 4790140 compact switch tables */
 /* ----------------------------------------------------------------------- */
+/* These aren't needed for Thumb2 since then we have actual instructions
+   to do what these functions do. */
+#ifndef __thumb2__
+
 /* Thumb switch table implementation.  Arm code, although must be called 
    from Thumb (the low bit of LR is expected to be 1).
    Expects the call site to be followed by 1-byte count, then <count>
@@ -1354,7 +1478,6 @@
 
 	FUNC_END switch32
 #endif
-/* APPLE LOCAL end ARM 4790140 compact switch tables */
 
 /* APPLE LOCAL begin 6465387 exception handling interworking VFP save */
 #if (__ARM_ARCH__ == 6)
@@ -1374,6 +1497,9 @@
 #endif
 /* APPLE LOCAL end 6465387 exception handling interworking VFP save */
 
+#endif /* !defined (__thumb2__) */
+/* APPLE LOCAL end ARM 4790140 compact switch tables */
+
 #endif /* __symbian__ */
 
 /* ------------------------------------------------------------------------ */
@@ -1427,6 +1553,12 @@
 
 #endif /* L_call_via_rX */
 
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+/* Don't bother with the old interworking routines for Thumb-2.  */
+/* ??? Maybe only omit these on v7m.  */
+#ifndef __thumb2__
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
 #if defined L_interwork_call_via_rX
 
 /* These labels & instructions are used by the Arm/Thumb interworking code,
@@ -1552,6 +1684,8 @@
 	SIZE	(_interwork_call_via_lr)
 	
 #endif /* L_interwork_call_via_rX */
+/* APPLE LOCAL v7 support. Merge from mainline */
+#endif /* !__thumb2__ */
 #endif /* Arch supports thumb.  */
 
 #ifndef __symbian__

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/libunwind.S
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/libunwind.S?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/libunwind.S (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/libunwind.S Wed Jul 22 15:36:27 2009
@@ -43,7 +43,16 @@
 	   this.  */
 	add r1, r0, #52
 	ldmia r1, {r3, r4, r5}  /* {sp, lr, pc}.  */
-#ifdef __INTERWORKING__
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+	/* Thumb-2 doesn't allow sp in a load-multiple instruction, so push
+	   the target address onto the target stack.  This is safe as
+	   we're always returning to somewhere further up the call stack.  */
+	mov ip, r3
+	mov lr, r4
+	str r5, [ip, #-4]!
+#elif defined(__INTERWORKING__)
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	/* Restore pc into ip.  */
 	mov r2, r5
 	stmfd sp!, {r2, r3, r4}
@@ -52,8 +61,14 @@
 #endif
 	/* Don't bother restoring ip.  */
 	ldmia r0, {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp}
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+	/* Pop the return address off the target stack.  */
+	mov sp, ip
+	pop {pc}
+#elif defined(__INTERWORKING__)
 	/* Pop the three registers we pushed earlier.  */
-#ifdef __INTERWORKING__
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	ldmfd sp, {ip, sp, lr}
 	bx ip
 #else
@@ -62,27 +77,129 @@
 	FUNC_END restore_core_regs
 	UNPREFIX restore_core_regs
 
-/* Load VFP registers d0-d15 from the address in r0.  */
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+/* Load VFP registers d0-d15 from the address in r0.
+   Use this to load from FSTMX format.  */
+/* APPLE LOCAL end v7 support. Merge from mainline */
 ARM_FUNC_START gnu_Unwind_Restore_VFP
 	/* Use the generic coprocessor form so that gas doesn't complain
 	   on soft-float targets.  */
 	ldc   p11,cr0,[r0],{0x21} /* fldmiax r0, {d0-d15} */
 	RET
 
-/* Store VFR regsters d0-d15 to the address in r0.  */
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+/* Store VFP registers d0-d15 to the address in r0.
+   Use this to store in FSTMX format.  */
+/* APPLE LOCAL end v7 support. Merge from mainline */
 ARM_FUNC_START gnu_Unwind_Save_VFP
 	/* Use the generic coprocessor form so that gas doesn't complain
 	   on soft-float targets.  */
 	stc   p11,cr0,[r0],{0x21} /* fstmiax r0, {d0-d15} */
 	RET
 
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+/* Load VFP registers d0-d15 from the address in r0.
+   Use this to load from FSTMD format.  */
+ARM_FUNC_START gnu_Unwind_Restore_VFP_D
+	ldc   p11,cr0,[r0],{0x20} /* fldmiad r0, {d0-d15} */
+	RET
+
+/* Store VFP registers d0-d15 to the address in r0.
+   Use this to store in FLDMD format.  */
+ARM_FUNC_START gnu_Unwind_Save_VFP_D
+	stc   p11,cr0,[r0],{0x20} /* fstmiad r0, {d0-d15} */
+	RET
+
+/* Load VFP registers d16-d31 from the address in r0.
+   Use this to load from FSTMD (=VSTM) format.  Needs VFPv3.  */
+ARM_FUNC_START gnu_Unwind_Restore_VFP_D_16_to_31
+	ldcl  p11,cr0,[r0],{0x20} /* vldm r0, {d16-d31} */
+	RET
+
+/* Store VFP registers d16-d31 to the address in r0.
+   Use this to store in FLDMD (=VLDM) format.  Needs VFPv3.  */
+ARM_FUNC_START gnu_Unwind_Save_VFP_D_16_to_31
+	stcl  p11,cr0,[r0],{0x20} /* vstm r0, {d16-d31} */
+	RET
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
+/* APPLE LOCAL begin v7 support. Merge from Codesourcery */
+ARM_FUNC_START gnu_Unwind_Restore_WMMXD
+	/* Use the generic coprocessor form so that gas doesn't complain
+	   on non-iWMMXt targets.  */
+	ldcl  p1, cr0, [r0], #8 /* wldrd wr0, [r0], #8 */
+	ldcl  p1, cr1, [r0], #8 /* wldrd wr1, [r0], #8 */
+	ldcl  p1, cr2, [r0], #8 /* wldrd wr2, [r0], #8 */
+	ldcl  p1, cr3, [r0], #8 /* wldrd wr3, [r0], #8 */
+	ldcl  p1, cr4, [r0], #8 /* wldrd wr4, [r0], #8 */
+	ldcl  p1, cr5, [r0], #8 /* wldrd wr5, [r0], #8 */
+	ldcl  p1, cr6, [r0], #8 /* wldrd wr6, [r0], #8 */
+	ldcl  p1, cr7, [r0], #8 /* wldrd wr7, [r0], #8 */
+	ldcl  p1, cr8, [r0], #8 /* wldrd wr8, [r0], #8 */
+	ldcl  p1, cr9, [r0], #8 /* wldrd wr9, [r0], #8 */
+	ldcl  p1, cr10, [r0], #8 /* wldrd wr10, [r0], #8 */
+	ldcl  p1, cr11, [r0], #8 /* wldrd wr11, [r0], #8 */
+	ldcl  p1, cr12, [r0], #8 /* wldrd wr12, [r0], #8 */
+	ldcl  p1, cr13, [r0], #8 /* wldrd wr13, [r0], #8 */
+	ldcl  p1, cr14, [r0], #8 /* wldrd wr14, [r0], #8 */
+	ldcl  p1, cr15, [r0], #8 /* wldrd wr15, [r0], #8 */
+	RET
+
+ARM_FUNC_START gnu_Unwind_Save_WMMXD
+	/* Use the generic coprocessor form so that gas doesn't complain
+	   on non-iWMMXt targets.  */
+	stcl  p1, cr0, [r0], #8 /* wstrd wr0, [r0], #8 */
+	stcl  p1, cr1, [r0], #8 /* wstrd wr1, [r0], #8 */
+	stcl  p1, cr2, [r0], #8 /* wstrd wr2, [r0], #8 */
+	stcl  p1, cr3, [r0], #8 /* wstrd wr3, [r0], #8 */
+	stcl  p1, cr4, [r0], #8 /* wstrd wr4, [r0], #8 */
+	stcl  p1, cr5, [r0], #8 /* wstrd wr5, [r0], #8 */
+	stcl  p1, cr6, [r0], #8 /* wstrd wr6, [r0], #8 */
+	stcl  p1, cr7, [r0], #8 /* wstrd wr7, [r0], #8 */
+	stcl  p1, cr8, [r0], #8 /* wstrd wr8, [r0], #8 */
+	stcl  p1, cr9, [r0], #8 /* wstrd wr9, [r0], #8 */
+	stcl  p1, cr10, [r0], #8 /* wstrd wr10, [r0], #8 */
+	stcl  p1, cr11, [r0], #8 /* wstrd wr11, [r0], #8 */
+	stcl  p1, cr12, [r0], #8 /* wstrd wr12, [r0], #8 */
+	stcl  p1, cr13, [r0], #8 /* wstrd wr13, [r0], #8 */
+	stcl  p1, cr14, [r0], #8 /* wstrd wr14, [r0], #8 */
+	stcl  p1, cr15, [r0], #8 /* wstrd wr15, [r0], #8 */
+	RET
+
+ARM_FUNC_START gnu_Unwind_Restore_WMMXC
+	/* Use the generic coprocessor form so that gas doesn't complain
+	   on non-iWMMXt targets.  */
+	ldc2  p1, cr8, [r0], #4 /* wldrw wcgr0, [r0], #4 */
+	ldc2  p1, cr9, [r0], #4 /* wldrw wcgr1, [r0], #4 */
+	ldc2  p1, cr10, [r0], #4 /* wldrw wcgr2, [r0], #4 */
+	ldc2  p1, cr11, [r0], #4 /* wldrw wcgr3, [r0], #4 */
+	RET
+
+ARM_FUNC_START gnu_Unwind_Save_WMMXC
+	/* Use the generic coprocessor form so that gas doesn't complain
+	   on non-iWMMXt targets.  */
+	stc2  p1, cr8, [r0], #4 /* wstrw wcgr0, [r0], #4 */
+	stc2  p1, cr9, [r0], #4 /* wstrw wcgr1, [r0], #4 */
+	stc2  p1, cr10, [r0], #4 /* wstrw wcgr2, [r0], #4 */
+	stc2  p1, cr11, [r0], #4 /* wstrw wcgr3, [r0], #4 */
+	RET
+
+/* APPLE LOCAL end v7 support. Merge from Codesourcery */
 /* Wrappers to save core registers, then call the real routine.   */
 
 .macro  UNWIND_WRAPPER name nargs
 	ARM_FUNC_START \name
 	/* Create a phase2_vrs structure.  */
 	/* Split reg push in two to ensure the correct value for sp.  */
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+	mov ip, sp
+	push {lr} /* PC is ignored.  */
+	push {ip, lr} /* Push original SP and LR.  */
+#else
 	stmfd sp!, {sp, lr, pc}
+#endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	stmfd sp!, {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip}
 	
 	/* Demand-save flags, plus an extra word for alignment.  */
@@ -91,7 +208,8 @@
 
 	/* Point r1 at the block.  Pass r[0..nargs) unchanged.  */
 	add r\nargs, sp, #4
-#if defined(__thumb__)
+/* APPLE LOCAL v7 support. Merge from mainline */
+#if defined(__thumb__) && !defined(__thumb2__)
 	/* Switch back to thumb mode to avoid interworking hassle.  */
 	adr ip, .L1_\name
 	orr ip, ip, #1

Added: llvm-gcc-4.2/trunk/gcc/config/arm/neon-docgen.ml
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/neon-docgen.ml?rev=76781&view=auto

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/neon-docgen.ml (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/neon-docgen.ml Wed Jul 22 15:36:27 2009
@@ -0,0 +1,323 @@
+(* APPLE LOCAL file v7 support. Merge from Codesourcery *)
+(* ARM NEON documentation generator.
+
+   Copyright (C) 2006 Free Software Foundation, Inc.
+   Contributed by CodeSourcery.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it under
+   the terms of the GNU General Public License as published by the Free
+   Software Foundation; either version 2, or (at your option) any later
+   version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or
+   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+   for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING.  If not, write to the Free
+   Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+   02110-1301, USA.  *)
+
+open Neon
+
+(* The combined "ops" and "reinterp" table.  *)
+let ops_reinterp = reinterp @ ops
+
+(* Helper functions for extracting things from the "ops" table.  *)
+let single_opcode desired_opcode () =
+  List.fold_left (fun got_so_far ->
+                  fun row ->
+                    match row with
+                      (opcode, _, _, _, _, _) ->
+                        if opcode = desired_opcode then row :: got_so_far
+                                                   else got_so_far
+                 ) [] ops_reinterp
+
+let multiple_opcodes desired_opcodes () =
+  List.fold_left (fun got_so_far ->
+                  fun desired_opcode ->
+                    (single_opcode desired_opcode ()) @ got_so_far)
+                 [] desired_opcodes
+
+let ldx_opcode number () =
+  List.fold_left (fun got_so_far ->
+                  fun row ->
+                    match row with
+                      (opcode, _, _, _, _, _) ->
+                        match opcode with
+                          Vldx n | Vldx_lane n | Vldx_dup n when n = number ->
+                            row :: got_so_far
+                          | _ -> got_so_far
+                 ) [] ops_reinterp
+
+let stx_opcode number () =
+  List.fold_left (fun got_so_far ->
+                  fun row ->
+                    match row with
+                      (opcode, _, _, _, _, _) ->
+                        match opcode with
+                          Vstx n | Vstx_lane n when n = number ->
+                            row :: got_so_far
+                          | _ -> got_so_far
+                 ) [] ops_reinterp
+
+let tbl_opcode () =
+  List.fold_left (fun got_so_far ->
+                  fun row ->
+                    match row with
+                      (opcode, _, _, _, _, _) ->
+                        match opcode with
+                          Vtbl _ -> row :: got_so_far
+                          | _ -> got_so_far
+                 ) [] ops_reinterp
+
+let tbx_opcode () =
+  List.fold_left (fun got_so_far ->
+                  fun row ->
+                    match row with
+                      (opcode, _, _, _, _, _) ->
+                        match opcode with
+                          Vtbx _ -> row :: got_so_far
+                          | _ -> got_so_far
+                 ) [] ops_reinterp
+
+(* The groups of intrinsics.  *)
+let intrinsic_groups =
+  [ "Addition", single_opcode Vadd;
+    "Multiplication", single_opcode Vmul;
+    "Multiply-accumulate", single_opcode Vmla;
+    "Multiply-subtract", single_opcode Vmls;
+    "Subtraction", single_opcode Vsub;
+    "Comparison (equal-to)", single_opcode Vceq;
+    "Comparison (greater-than-or-equal-to)", single_opcode Vcge;
+    "Comparison (less-than-or-equal-to)", single_opcode Vcle;
+    "Comparison (greater-than)", single_opcode Vcgt;
+    "Comparison (less-than)", single_opcode Vclt;
+    "Comparison (absolute greater-than-or-equal-to)", single_opcode Vcage;
+    "Comparison (absolute less-than-or-equal-to)", single_opcode Vcale;
+    "Comparison (absolute greater-than)", single_opcode Vcagt;
+    "Comparison (absolute less-than)", single_opcode Vcalt;
+    "Test bits", single_opcode Vtst;
+    "Absolute difference", single_opcode Vabd;
+    "Absolute difference and accumulate", single_opcode Vaba;
+    "Maximum", single_opcode Vmax;
+    "Minimum", single_opcode Vmin;
+    "Pairwise add", single_opcode Vpadd;
+    "Pairwise add, single_opcode widen and accumulate", single_opcode Vpada;
+    "Folding maximum", single_opcode Vpmax;
+    "Folding minimum", single_opcode Vpmin;
+    "Reciprocal step", multiple_opcodes [Vrecps; Vrsqrts];
+    "Vector shift left", single_opcode Vshl;
+    "Vector shift left by constant", single_opcode Vshl_n;
+    "Vector shift right by constant", single_opcode Vshr_n;
+    "Vector shift right by constant and accumulate", single_opcode Vsra_n;
+    "Vector shift right and insert", single_opcode Vsri;
+    "Vector shift left and insert", single_opcode Vsli;
+    "Absolute value", single_opcode Vabs;
+    "Negation", single_opcode Vneg;
+    "Bitwise not", single_opcode Vmvn;
+    "Count leading sign bits", single_opcode Vcls;
+    "Count leading zeros", single_opcode Vclz;
+    "Count number of set bits", single_opcode Vcnt;
+    "Reciprocal estimate", single_opcode Vrecpe;
+    "Reciprocal square-root estimate", single_opcode Vrsqrte;
+    "Get lanes from a vector", single_opcode Vget_lane;
+    "Set lanes in a vector", single_opcode Vset_lane;
+    "Create vector from literal bit pattern", single_opcode Vcreate;
+    "Set all lanes to the same value",
+      multiple_opcodes [Vdup_n; Vmov_n; Vdup_lane];
+    "Combining vectors", single_opcode Vcombine;
+    "Splitting vectors", multiple_opcodes [Vget_high; Vget_low];
+    "Conversions", multiple_opcodes [Vcvt; Vcvt_n];
+    "Move, single_opcode narrowing", single_opcode Vmovn;
+    "Move, single_opcode long", single_opcode Vmovl;
+    "Table lookup", tbl_opcode;
+    "Extended table lookup", tbx_opcode;
+    "Multiply, lane", single_opcode Vmul_lane;
+    "Long multiply, lane", single_opcode Vmull_lane;
+    "Saturating doubling long multiply, lane", single_opcode Vqdmull_lane;
+    "Saturating doubling multiply high, lane", single_opcode Vqdmulh_lane;
+    "Multiply-accumulate, lane", single_opcode Vmla_lane;
+    "Multiply-subtract, lane", single_opcode Vmls_lane;
+    "Vector multiply by scalar", single_opcode Vmul_n;
+    "Vector long multiply by scalar", single_opcode Vmull_n;
+    "Vector saturating doubling long multiply by scalar",
+      single_opcode Vqdmull_n;
+    "Vector saturating doubling multiply high by scalar",
+      single_opcode Vqdmulh_n;
+    "Vector multiply-accumulate by scalar", single_opcode Vmla_n;
+    "Vector multiply-subtract by scalar", single_opcode Vmls_n;
+    "Vector extract", single_opcode Vext;
+    "Reverse elements", multiple_opcodes [Vrev64; Vrev32; Vrev16];
+    "Bit selection", single_opcode Vbsl;
+    "Transpose elements", single_opcode Vtrn;
+    "Zip elements", single_opcode Vzip;
+    "Unzip elements", single_opcode Vuzp;
+    "Element/structure loads, VLD1 variants", ldx_opcode 1;
+    "Element/structure stores, VST1 variants", stx_opcode 1;
+    "Element/structure loads, VLD2 variants", ldx_opcode 2;
+    "Element/structure stores, VST2 variants", stx_opcode 2;
+    "Element/structure loads, VLD3 variants", ldx_opcode 3;
+    "Element/structure stores, VST3 variants", stx_opcode 3;
+    "Element/structure loads, VLD4 variants", ldx_opcode 4;
+    "Element/structure stores, VST4 variants", stx_opcode 4;
+    "Logical operations (AND)", single_opcode Vand;
+    "Logical operations (OR)", single_opcode Vorr;
+    "Logical operations (exclusive OR)", single_opcode Veor;
+    "Logical operations (AND-NOT)", single_opcode Vbic;
+    "Logical operations (OR-NOT)", single_opcode Vorn;
+    "Reinterpret casts", single_opcode Vreinterp ]
+
+(* Given an intrinsic shape, produce a string to document the corresponding
+   operand shapes.  *)
+let rec analyze_shape shape =
+  let rec n_things n thing =
+    match n with
+      0 -> []
+    | n -> thing :: (n_things (n - 1) thing)
+  in
+  let rec analyze_shape_elt reg_no elt =
+    match elt with
+      Dreg -> "@var{d" ^ (string_of_int reg_no) ^ "}"
+    | Qreg -> "@var{q" ^ (string_of_int reg_no) ^ "}"
+    | Corereg -> "@var{r" ^ (string_of_int reg_no) ^ "}"
+    | Immed -> "#@var{0}"
+    | VecArray (1, elt) ->
+        let elt_regexp = analyze_shape_elt 0 elt in
+          "@{" ^ elt_regexp ^ "@}"
+    | VecArray (n, elt) ->
+      let rec f m =
+        match m with
+          0 -> []
+        | m -> (analyze_shape_elt (m - 1) elt) :: (f (m - 1))
+      in
+      let ops = List.rev (f n) in
+        "@{" ^ (commas (fun x -> x) ops "") ^ "@}"
+    | (PtrTo elt | CstPtrTo elt) ->
+      "[" ^ (analyze_shape_elt reg_no elt) ^ "]"
+    | Element_of_dreg -> (analyze_shape_elt reg_no Dreg) ^ "[@var{0}]"
+    | Element_of_qreg -> (analyze_shape_elt reg_no Qreg) ^ "[@var{0}]"
+    | All_elements_of_dreg -> (analyze_shape_elt reg_no Dreg) ^ "[]"
+  in
+    match shape with
+      All (n, elt) -> commas (analyze_shape_elt 0) (n_things n elt) ""
+    | Long -> (analyze_shape_elt 0 Qreg) ^ ", " ^ (analyze_shape_elt 0 Dreg) ^
+              ", " ^ (analyze_shape_elt 0 Dreg)
+    | Long_noreg elt -> (analyze_shape_elt 0 elt) ^ ", " ^
+              (analyze_shape_elt 0 elt)
+    | Wide -> (analyze_shape_elt 0 Qreg) ^ ", " ^ (analyze_shape_elt 0 Qreg) ^
+              ", " ^ (analyze_shape_elt 0 Dreg)
+    | Wide_noreg elt -> analyze_shape (Long_noreg elt)
+    | Narrow -> (analyze_shape_elt 0 Dreg) ^ ", " ^ (analyze_shape_elt 0 Qreg) ^
+                ", " ^ (analyze_shape_elt 0 Qreg)
+    | Use_operands elts -> commas (analyze_shape_elt 0) (Array.to_list elts) ""
+    | By_scalar Dreg ->
+        analyze_shape (Use_operands [| Dreg; Dreg; Element_of_dreg |])
+    | By_scalar Qreg ->
+        analyze_shape (Use_operands [| Qreg; Qreg; Element_of_dreg |])
+    | By_scalar _ -> assert false
+    | Wide_lane ->
+        analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |])
+    | Wide_scalar ->
+        analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |])
+    | Pair_result elt ->
+      let elt_regexp = analyze_shape_elt 0 elt in
+      let elt_regexp' = analyze_shape_elt 1 elt in
+        elt_regexp ^ ", " ^ elt_regexp'
+    | Unary_scalar _ -> "FIXME Unary_scalar"
+    | Binary_imm elt -> analyze_shape (Use_operands [| elt; elt; Immed |])
+    | Narrow_imm -> analyze_shape (Use_operands [| Dreg; Qreg; Immed |])
+    | Long_imm -> analyze_shape (Use_operands [| Qreg; Dreg; Immed |])
+
+(* Document a single intrinsic.  *)
+let describe_intrinsic first chan
+                       (elt_ty, (_, features, shape, name, munge, _)) =
+  let c_arity, new_elt_ty = munge shape elt_ty in
+  let c_types = strings_of_arity c_arity in
+  Printf.fprintf chan "@itemize @bullet\n";
+  let item_code = if first then "@item" else "@itemx" in
+    Printf.fprintf chan "%s %s %s_%s (" item_code (List.hd c_types)
+                   (intrinsic_name name) (string_of_elt elt_ty);
+    Printf.fprintf chan "%s)\n" (commas (fun ty -> ty) (List.tl c_types) "");
+    if not (List.exists (fun feature -> feature = No_op) features) then
+    begin
+      let print_one_insn name =
+        Printf.fprintf chan "@code{";
+        let no_suffix = (new_elt_ty = NoElts) in
+        let name_with_suffix =
+          if no_suffix then name
+          else name ^ "." ^ (string_of_elt_dots new_elt_ty)
+        in
+        let possible_operands = analyze_all_shapes features shape
+                                                   analyze_shape
+        in
+	let rec print_one_possible_operand op =
+	  Printf.fprintf chan "%s %s}" name_with_suffix op
+        in
+          (* If the intrinsic expands to multiple instructions, we assume
+             they are all of the same form.  *)
+          print_one_possible_operand (List.hd possible_operands)
+      in
+      let rec print_insns names =
+        match names with
+          [] -> ()
+        | [name] -> print_one_insn name
+        | name::names -> (print_one_insn name;
+                          Printf.fprintf chan " @emph{or} ";
+                          print_insns names)
+      in
+      let insn_names = get_insn_names features name in
+        Printf.fprintf chan "@*@emph{Form of expected instruction(s):} ";
+        print_insns insn_names;
+        Printf.fprintf chan "\n"
+    end;
+    Printf.fprintf chan "@end itemize\n";
+    Printf.fprintf chan "\n\n"
+
+(* Document a group of intrinsics.  *)
+let document_group chan (group_title, group_extractor) =
+  (* Extract the rows in question from the ops table and then turn them
+     into a list of intrinsics.  *)
+  let intrinsics =
+    List.fold_left (fun got_so_far ->
+                    fun row ->
+                      match row with
+                        (_, _, _, _, _, elt_tys) ->
+                          List.fold_left (fun got_so_far' ->
+                                          fun elt_ty ->
+                                            (elt_ty, row) :: got_so_far')
+                                         got_so_far elt_tys
+                   ) [] (group_extractor ())
+  in
+    (* Emit the title for this group.  *)
+    Printf.fprintf chan "@subsubsection %s\n\n" group_title;
+    (* Emit a description of each intrinsic.  *)
+    List.iter (describe_intrinsic true chan) intrinsics;
+    (* Close this group.  *)
+    Printf.fprintf chan "\n\n"
+
+let gnu_header chan =
+  List.iter (fun s -> Printf.fprintf chan "%s\n" s) [
+  "@c Copyright (C) 2006 Free Software Foundation, Inc.";
+  "@c This is part of the GCC manual.";
+  "@c For copying conditions, see the file gcc.texi.";
+  "";
+  "@c This file is generated automatically using gcc/config/arm/neon-docgen.ml";
+  "@c Please do not edit manually."]
+
+(* Program entry point.  *)
+let _ =
+  if Array.length Sys.argv <> 2 then
+    failwith "Usage: neon-docgen <output filename>"
+  else
+  let file = Sys.argv.(1) in
+    try
+      let chan = open_out file in
+        gnu_header chan;
+        List.iter (document_group chan) intrinsic_groups;
+        close_out chan
+    with Sys_error sys ->
+      failwith ("Could not create output file " ^ file ^ ": " ^ sys)

Added: llvm-gcc-4.2/trunk/gcc/config/arm/neon-gen.ml
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/neon-gen.ml?rev=76781&view=auto

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/neon-gen.ml (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/neon-gen.ml Wed Jul 22 15:36:27 2009
@@ -0,0 +1,424 @@
+(* APPLE LOCAL file v7 support. Merge from Codesourcery *)
+(* Auto-generate ARM Neon intrinsics header file.
+   Copyright (C) 2006, 2007 Free Software Foundation, Inc.
+   Contributed by CodeSourcery.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it under
+   the terms of the GNU General Public License as published by the Free
+   Software Foundation; either version 2, or (at your option) any later
+   version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or
+   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+   for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING.  If not, write to the Free
+   Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+   02110-1301, USA.
+
+   This is an O'Caml program.  The O'Caml compiler is available from:
+
+     http://caml.inria.fr/
+
+   Or from your favourite OS's friendly packaging system. Tested with version
+   3.09.2, though other versions will probably work too.
+  
+   Compile with:
+     ocamlc -c neon.ml
+     ocamlc -o neon-gen neon.cmo neon-gen.ml
+
+   Run with:
+     ./neon-gen > arm_neon.h
+*)
+
+open Neon
+
+(* The format codes used in the following functions are documented at:
+     http://caml.inria.fr/pub/docs/manual-ocaml/libref/Format.html\
+     #6_printflikefunctionsforprettyprinting
+   (one line, remove the backslash.)
+*)
+
+(* Following functions can be used to approximate GNU indentation style.  *)
+let start_function () =
+  Format.printf "@[<v 0>";
+  ref 0
+
+let end_function nesting =
+  match !nesting with
+    0 -> Format.printf "@;@;@]"
+  | _ -> failwith ("Bad nesting (ending function at level "
+                   ^ (string_of_int !nesting) ^ ")")
+   
+let open_braceblock nesting =
+  begin match !nesting with
+    0 -> Format.printf "@,@<0>{@[<v 2>@,"
+  | _ -> Format.printf "@,@[<v 2>  @<0>{@[<v 2>@,"
+  end;
+  incr nesting
+
+let close_braceblock nesting =
+  decr nesting;
+  match !nesting with
+    0 -> Format.printf "@]@,@<0>}"
+  | _ -> Format.printf "@]@,@<0>}@]"
+
+let print_function arity fnname body =
+  let ffmt = start_function () in
+  Format.printf "__extension__ static __inline ";
+  let inl = "__attribute__ ((__always_inline__))" in
+  begin match arity with
+    Arity0 ret ->
+      Format.printf "%s %s@,%s (void)" (string_of_vectype ret) inl fnname
+  | Arity1 (ret, arg0) ->
+      Format.printf "%s %s@,%s (%s __a)" (string_of_vectype ret) inl fnname
+                                        (string_of_vectype arg0)
+  | Arity2 (ret, arg0, arg1) ->
+      Format.printf "%s %s@,%s (%s __a, %s __b)"
+        (string_of_vectype ret) inl fnname (string_of_vectype arg0)
+	(string_of_vectype arg1)
+  | Arity3 (ret, arg0, arg1, arg2) ->
+      Format.printf "%s %s@,%s (%s __a, %s __b, %s __c)"
+        (string_of_vectype ret) inl fnname (string_of_vectype arg0)
+	(string_of_vectype arg1) (string_of_vectype arg2)
+  | Arity4 (ret, arg0, arg1, arg2, arg3) ->
+      Format.printf "%s %s@,%s (%s __a, %s __b, %s __c, %s __d)"
+        (string_of_vectype ret) inl fnname (string_of_vectype arg0)
+	(string_of_vectype arg1) (string_of_vectype arg2)
+        (string_of_vectype arg3)
+  end;
+  open_braceblock ffmt;
+  let rec print_lines = function
+    [] -> ()
+  | [line] -> Format.printf "%s" line
+  | line::lines -> Format.printf "%s@," line; print_lines lines in
+  print_lines body;
+  close_braceblock ffmt;
+  end_function ffmt
+
+let return_by_ptr features = List.mem ReturnPtr features
+
+let union_string num elts base =
+  let itype = inttype_for_array num elts in
+  let iname = string_of_inttype itype
+  and sname = string_of_vectype (T_arrayof (num, elts)) in
+  Printf.sprintf "union { %s __i; %s __o; } %s" sname iname base
+
+let rec signed_ctype = function
+    T_uint8x8 | T_poly8x8 -> T_int8x8
+  | T_uint8x16 | T_poly8x16 -> T_int8x16
+  | T_uint16x4 | T_poly16x4 -> T_int16x4
+  | T_uint16x8 | T_poly16x8 -> T_int16x8
+  | T_uint32x2 -> T_int32x2
+  | T_uint32x4 -> T_int32x4
+  | T_uint64x1 -> T_int64x1
+  | T_uint64x2 -> T_int64x2
+  (* Cast to types defined by mode in arm.c, not random types pulled in from
+     the <stdint.h> header in use. This fixes incompatible pointer errors when
+     compiling with C++.  *)
+  | T_uint8 | T_int8 -> T_intQI
+  | T_uint16 | T_int16 -> T_intHI
+  | T_uint32 | T_int32 -> T_intSI
+  | T_uint64 | T_int64 -> T_intDI
+  | T_poly8 -> T_intQI
+  | T_poly16 -> T_intHI
+  | T_arrayof (n, elt) -> T_arrayof (n, signed_ctype elt)
+  | T_ptrto elt -> T_ptrto (signed_ctype elt)
+  | T_const elt -> T_const (signed_ctype elt)
+  | x -> x
+
+let add_cast ctype cval =
+  let stype = signed_ctype ctype in
+  if ctype <> stype then
+    Printf.sprintf "(%s) %s" (string_of_vectype stype) cval
+  else
+    cval
+
+let cast_for_return to_ty = "(" ^ (string_of_vectype to_ty) ^ ")"
+
+(* Return a tuple of a list of declarations to go at the start of the function,
+   and a list of statements needed to return THING.  *)
+let return arity return_by_ptr thing =
+  match arity with
+    Arity0 (ret) | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
+  | Arity4 (ret, _, _, _, _) ->
+    match ret with
+      T_arrayof (num, vec) ->
+        if return_by_ptr then
+          let sname = string_of_vectype ret in
+          [Printf.sprintf "%s __rv;" sname],
+          [thing ^ ";"; "return __rv;"]
+        else
+          let uname = union_string num vec "__rv" in
+          [uname ^ ";"], ["__rv.__o = " ^ thing ^ ";"; "return __rv.__i;"]
+    | T_void -> [], [thing ^ ";"]
+    | _ ->
+        [], ["return " ^ (cast_for_return ret) ^ thing ^ ";"]
+
+let rec element_type ctype =
+  match ctype with
+    T_arrayof (_, v) -> element_type v
+  | _ -> ctype
+
+let params return_by_ptr ps =
+  let pdecls = ref [] in
+  let ptype t p =
+    match t with
+      T_arrayof (num, elts) ->
+        let uname = union_string num elts (p ^ "u") in
+        let decl = Printf.sprintf "%s = { %s };" uname p in
+        pdecls := decl :: !pdecls;
+        p ^ "u.__o"
+    | _ -> add_cast t p in
+  let plist = match ps with
+    Arity0 _ -> []
+  | Arity1 (_, t1) -> [ptype t1 "__a"]
+  | Arity2 (_, t1, t2) -> [ptype t1 "__a"; ptype t2 "__b"]
+  | Arity3 (_, t1, t2, t3) -> [ptype t1 "__a"; ptype t2 "__b"; ptype t3 "__c"]
+  | Arity4 (_, t1, t2, t3, t4) ->
+      [ptype t1 "__a"; ptype t2 "__b"; ptype t3 "__c"; ptype t4 "__d"] in
+  match ps with
+    Arity0 ret | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
+  | Arity4 (ret, _, _, _, _) ->
+      if return_by_ptr then
+        !pdecls, add_cast (T_ptrto (element_type ret)) "&__rv.val[0]" :: plist
+      else
+        !pdecls, plist
+
+let modify_params features plist =
+  let is_flipped =
+    List.exists (function Flipped _ -> true | _ -> false) features in
+  if is_flipped then
+    match plist with
+      [ a; b ] -> [ b; a ]
+    | _ ->
+      failwith ("Don't know how to flip args " ^ (String.concat ", " plist))
+  else
+    plist
+
+(* !!! Decide whether to add an extra information word based on the shape
+   form.  *)
+let extra_word shape features paramlist bits =
+  let use_word =
+    match shape with
+      All _ | Long | Long_noreg _ | Wide | Wide_noreg _ | Narrow
+    | By_scalar _ | Wide_scalar | Wide_lane | Binary_imm _ | Long_imm
+    | Narrow_imm -> true
+    | _ -> List.mem InfoWord features
+  in
+    if use_word then
+      paramlist @ [string_of_int bits]
+    else
+      paramlist
+
+(* Bit 0 represents signed (1) vs unsigned (0), or float (1) vs poly (0).
+   Bit 1 represents rounding (1) vs none (0)
+   Bit 2 represents floats & polynomials (1), or ordinary integers (0).  *)
+let infoword_value elttype features =
+  let bits02 =
+    match elt_class elttype with
+      Signed | ConvClass (Signed, _) | ConvClass (_, Signed) -> 0b001
+    | Poly -> 0b100
+    | Float -> 0b101
+    | _ -> 0b000
+  and rounding_bit = if List.mem Rounding features then 0b010 else 0b000 in
+  bits02 lor rounding_bit
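[Editorial aside: a couple of worked values make the bit layout above concrete. The standalone sketch below reimplements only the bit arithmetic; its elt_kind type is an illustrative stand-in for the element classes provided by the real Neon module, and the conversion-class cases are omitted.]

    type elt_kind = Signed | Unsigned | Poly | Float

    let infoword kind rounding =
      let bits02 =
        (match kind with
           Signed -> 0b001
         | Poly -> 0b100
         | Float -> 0b101
         | Unsigned -> 0b000)
      and rbit = if rounding then 0b010 else 0b000 in
      bits02 lor rbit

    let () =
      (* A rounded signed element encodes as 0b011 = 3; a plain float as 0b101 = 5.  *)
      Printf.printf "%d %d\n" (infoword Signed true) (infoword Float false)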
+
+(* "Cast" type operations will throw an exception in mode_of_elt (actually in
+   elt_width, called from there). Deal with that here, and generate a suffix
+   with multiple modes (<to><from>).  *)
+let rec mode_suffix elttype shape =
+  try
+    let mode = mode_of_elt elttype shape in
+    string_of_mode mode
+  with MixedMode (dst, src) ->
+    let dstmode = mode_of_elt dst shape
+    and srcmode = mode_of_elt src shape in
+    string_of_mode dstmode ^ string_of_mode srcmode
+
+let print_variant opcode features shape name (ctype, asmtype, elttype) =
+  let bits = infoword_value elttype features in
+  let modesuf = mode_suffix elttype shape in
+  let return_by_ptr = return_by_ptr features in
+  let pdecls, paramlist = params return_by_ptr ctype in
+  let paramlist' = modify_params features paramlist in
+  let paramlist'' = extra_word shape features paramlist' bits in
+  let parstr = String.concat ", " paramlist'' in
+  let builtin = Printf.sprintf "__builtin_neon_%s%s (%s)"
+                  (builtin_name features name) modesuf parstr in
+  let rdecls, stmts = return ctype return_by_ptr builtin in
+  let body = pdecls @ rdecls @ stmts
+  and fnname = (intrinsic_name name) ^ "_" ^ (string_of_elt elttype) in
+  print_function ctype fnname body
+
+(* When this function processes the element types in the ops table, it rewrites
+   them in a list of tuples (a,b,c):
+     a : C type as an "arity", e.g. Arity1 (T_poly8x8, T_poly8x8)
+     b : Asm type : a single, processed element type, e.g. P16. This is the
+         type which should be attached to the asm opcode.
+     c : Variant type : the unprocessed type for this variant (e.g. in add
+         instructions which don't care about the sign, b might be i16 and c
+         might be s16.)
+*)
+
+let print_op (opcode, features, shape, name, munge, types) =
+  let sorted_types = List.sort compare types in
+  let munged_types = List.map
+    (fun elt -> let c, asm = munge shape elt in c, asm, elt) sorted_types in
+  List.iter
+    (fun variant -> print_variant opcode features shape name variant)
+    munged_types
+  
+let print_ops ops =
+  List.iter print_op ops
+
+(* Output type definitions. Table entries are:
+     cbase : "C" name for the type.
+     abase : "ARM" base name for the type (i.e. int in int8x8_t).
+     esize : element size.
+     enum : element count.
+   We can't really distinguish between polynomial types and integer types in
+   the C type system, I don't think, which may allow the user to make mistakes
+   without warnings from the compiler.
+   FIXME: It's probably better to use stdint.h names here.
+*)
+
+let deftypes () =
+  let typeinfo = [
+    (* Doubleword vector types.  *)
+    "__builtin_neon_qi", "int", 8, 8;
+    "__builtin_neon_hi", "int", 16, 4;
+    "__builtin_neon_si", "int", 32, 2;
+    "__builtin_neon_di", "int", 64, 1;
+    "__builtin_neon_sf", "float", 32, 2;
+    "__builtin_neon_poly8", "poly", 8, 8;
+    "__builtin_neon_poly16", "poly", 16, 4;
+    "__builtin_neon_uqi", "uint", 8, 8;
+    "__builtin_neon_uhi", "uint", 16, 4;
+    "__builtin_neon_usi", "uint", 32, 2;
+    "__builtin_neon_udi", "uint", 64, 1;
+    
+    (* Quadword vector types.  *)
+    "__builtin_neon_qi", "int", 8, 16;
+    "__builtin_neon_hi", "int", 16, 8;
+    "__builtin_neon_si", "int", 32, 4;
+    "__builtin_neon_di", "int", 64, 2;
+    "__builtin_neon_sf", "float", 32, 4;
+    "__builtin_neon_poly8", "poly", 8, 16;
+    "__builtin_neon_poly16", "poly", 16, 8;
+    "__builtin_neon_uqi", "uint", 8, 16;
+    "__builtin_neon_uhi", "uint", 16, 8;
+    "__builtin_neon_usi", "uint", 32, 4;
+    "__builtin_neon_udi", "uint", 64, 2
+  ] in
+  List.iter
+    (fun (cbase, abase, esize, enum) ->
+      let attr =
+        match enum with
+          1 -> ""
+        | _ -> Printf.sprintf "\t__attribute__ ((__vector_size__ (%d)))"
+                              (esize * enum / 8) in
+      Format.printf "typedef %s %s%dx%d_t%s;@\n" cbase abase esize enum attr)
+    typeinfo;
+  Format.print_newline ();
+  (* Extra types not in <stdint.h>.  *)
+  Format.printf "typedef __builtin_neon_sf float32_t;\n";
+  Format.printf "typedef __builtin_neon_poly8 poly8_t;\n";
+  Format.printf "typedef __builtin_neon_poly16 poly16_t;\n"
+
+(* Output structs containing arrays, for load & store instructions etc.  *)
+
+let arrtypes () =
+  let typeinfo = [
+    "int", 8;    "int", 16;
+    "int", 32;   "int", 64;
+    "uint", 8;   "uint", 16;
+    "uint", 32;  "uint", 64;
+    "float", 32; "poly", 8;
+    "poly", 16
+  ] in
+  let writestruct elname elsize regsize arrsize =
+    let elnum = regsize / elsize in
+    let structname =
+      Printf.sprintf "%s%dx%dx%d_t" elname elsize elnum arrsize in
+    let sfmt = start_function () in
+    Format.printf "typedef struct %s" structname;
+    open_braceblock sfmt;
+    Format.printf "%s%dx%d_t val[%d];" elname elsize elnum arrsize;
+    close_braceblock sfmt;
+    Format.printf " %s;" structname;
+    end_function sfmt;
+  in
+    for n = 2 to 4 do
+      List.iter
+        (fun (elname, elsize) ->
+          writestruct elname elsize 64 n;
+          writestruct elname elsize 128 n)
+        typeinfo
+    done
+
+let print_lines = List.iter (fun s -> Format.printf "%s@\n" s)
+
+(* Do it.  *)
+
+let _ =
+  print_lines [
+"/* ARM NEON intrinsics include file. This file is generated automatically";
+"   using neon-gen.ml.  Please do not edit manually.";
+"";
+"   Copyright (C) 2006, 2007 Free Software Foundation, Inc.";
+"   Contributed by CodeSourcery.";
+"";
+"   This file is part of GCC.";
+"";
+"   GCC is free software; you can redistribute it and/or modify it";
+"   under the terms of the GNU General Public License as published";
+"   by the Free Software Foundation; either version 2, or (at your";
+"   option) any later version.";
+"";
+"   GCC is distributed in the hope that it will be useful, but WITHOUT";
+"   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY";
+"   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public";
+"   License for more details.";
+"";
+"   You should have received a copy of the GNU General Public License";
+"   along with GCC; see the file COPYING.  If not, write to the";
+"   Free Software Foundation, 51 Franklin Street, Fifth Floor, Boston,";
+"   MA 02110-1301, USA.  */";
+"";
+"/* As a special exception, if you include this header file into source";
+"   files compiled by GCC, this header file does not by itself cause";
+"   the resulting executable to be covered by the GNU General Public";
+"   License.  This exception does not however invalidate any other";
+"   reasons why the executable file might be covered by the GNU General";
+"   Public License.  */";
+"";
+"#ifndef _GCC_ARM_NEON_H";
+"#define _GCC_ARM_NEON_H 1";
+"";
+"#ifndef __ARM_NEON__";
+"#error You must enable NEON instructions (e.g. -mfloat-abi=softfp -mfpu=neon) to use arm_neon.h";
+"#else";
+"";
+"#ifdef __cplusplus";
+"extern \"C\" {";
+"#endif";
+"";
+"#include <stdint.h>";
+""];
+  deftypes ();
+  arrtypes ();
+  Format.print_newline ();
+  print_ops ops;
+  Format.print_newline ();
+  print_ops reinterp;
+  print_lines [
+"#ifdef __cplusplus";
+"}";
+"#endif";
+"#endif";
+"#endif"]

Added: llvm-gcc-4.2/trunk/gcc/config/arm/neon-schedgen.ml
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/neon-schedgen.ml?rev=76781&view=auto

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/neon-schedgen.ml (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/neon-schedgen.ml Wed Jul 22 15:36:27 2009
@@ -0,0 +1,498 @@
+(* APPLE LOCAL file v7 support. Merge from Codesourcery *)
+(* Emission of the core of the Cortex-A8 NEON scheduling description.
+   Copyright (C) 2007 Free Software Foundation, Inc.
+   Contributed by CodeSourcery.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it under
+   the terms of the GNU General Public License as published by the Free
+   Software Foundation; either version 2, or (at your option) any later
+   version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or
+   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+   for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING.  If not, write to the Free
+   Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+   02110-1301, USA.
+*)
+
+(* This scheduling description generator works as follows.
+   - Each group of instructions has source and destination requirements
+     specified.  The source requirements may be specified using
+     Source (the stage at which all source operands not otherwise
+     described are read), Source_m (the stage at which Rm operands are
+     read), Source_n (likewise for Rn) and Source_d (likewise for Rd).
+   - For each group of instructions the earliest stage where a source
+     operand may be required is calculated.
+   - Each group of instructions is selected in turn as a producer.
+     The latencies between this group and every other group are then
+     calculated, yielding up to four values for each combination:
+	1. Producer -> consumer Rn latency
+	2. Producer -> consumer Rm latency
+	3. Producer -> consumer Rd (as a source) latency
+	4. Producer -> consumer worst-case latency.
+     Value 4 is calculated from the destination availability requirements
+     of the consumer and the earliest source availability requirements
+     of the producer.
+   - The largest Value 4 calculated for the current producer is the
+     worst-case latency, L, for that instruction group.  This value is written
+     out in a define_insn_reservation for the producer group.
+   - For each producer and consumer pair, the latencies calculated above
+     are collated.  The average (of up to four values) is calculated and
+     if this average is different from the worst-case latency, an
+     unguarded define_bypass construction is issued for that pair.
+     (For each pair only one define_bypass construction will be emitted,
+     and at present we do not emit specific guards.)
+*)
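[Editorial aside: a minimal, self-contained sketch of the latency arithmetic the description above implies. The function name and the exact formula, result stage minus earliest source stage plus one, are assumptions drawn from this comment rather than code quoted from the generator.]

    let worst_case ~produced_at ~consumed_at = produced_at - consumed_at + 1

    let () =
      (* A producer whose result is ready at the end of stage 6, feeding a
         consumer that reads its sources at stage 2, gives a worst-case
         latency of 5 cycles.  *)
      Printf.printf "%d\n" (worst_case ~produced_at:6 ~consumed_at:2)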
+
+open Utils
+
+let n1 = 1 and n2 = 2 and n3 = 3 and n4 = 4 and n5 = 5 and n6 = 6
+    and n7 = 7 and n8 = 8 and n9 = 9
+
+type availability = Source of int
+                  | Source_n of int
+                  | Source_m of int
+                  | Source_d of int
+                  | Dest of int
+		  | Dest_n_after of int * int
+
+type guard = Guard_none | Guard_only_m | Guard_only_n | Guard_only_d
+
+(* Reservation behaviours.  All but the last row here correspond to one
+   pipeline each.  Each constructor will correspond to one
+   define_reservation.  *)
+type reservation =
+  Mul | Mul_2cycle | Mul_4cycle
+| Shift | Shift_2cycle
+| ALU | ALU_2cycle
+| Fmul | Fmul_2cycle
+| Fadd | Fadd_2cycle
+(* | VFP *)
+| Permute of int
+| Ls of int
+| Fmul_then_fadd | Fmul_then_fadd_2
+
+(* This table must be kept as short as possible by conflating
+   entries with the same availability behaviour.
+
+   First components: instruction group names
+   Second components: availability requirements, in the order in which
+   they should appear in the comments in the .md file.
+   Third components: reservation info
+*)
+let availability_table = [
+  (* NEON integer ALU instructions.  *)
+  (* vbit vbif vbsl vorr vbic vnot vcls vclz vcnt vadd vand vorr
+     veor vbic vorn ddd qqq *)
+  "neon_int_1", [Source n2; Dest n3], ALU;
+  (* vadd vsub qqd vsub ddd qqq *)
+  "neon_int_2", [Source_m n1; Source_n n2; Dest n3], ALU;
+  (* vsum vneg dd qq vadd vsub qdd *)
+  "neon_int_3", [Source n1; Dest n3], ALU;
+  (* vabs vceqz vcgez vcbtz vclez vcltz vadh vradh vsbh vrsbh dqq *)
+  (* vhadd vrhadd vqadd vtst ddd qqq *)
+  "neon_int_4", [Source n2; Dest n4], ALU;
+  (* vabd qdd vhsub vqsub vabd vceq vcge vcgt vmax vmin vfmx vfmn ddd ddd *)
+  "neon_int_5", [Source_m n1; Source_n n2; Dest n4], ALU;
+  (* vqneg vqabs dd qq *)
+  "neon_vqneg_vqabs", [Source n1; Dest n4], ALU;
+  (* vmov vmvn *)
+  "neon_vmov", [Dest n3], ALU;
+  (* vaba *)
+  "neon_vaba", [Source_n n2; Source_m n1; Source_d n3; Dest n6], ALU;
+  "neon_vaba_qqq",
+    [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], ALU_2cycle;
+  (* vsma *)
+  "neon_vsma", [Source_m n1; Source_d n3; Dest n6], ALU;
+
+  (* NEON integer multiply instructions.  *)
+  (* vmul, vqdmlh, vqrdmlh *)
+  (* vmul, vqdmul, qdd 16/8 long 32/16 long *)
+  "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long", [Source n2; Dest n6], Mul;
+  "neon_mul_qqq_8_16_32_ddd_32", [Source n2; Dest_n_after (1, n6)], Mul_2cycle;
+  (* vmul, vqdmul again *)
+  "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar",
+    [Source_n n2; Source_m n1; Dest_n_after (1, n6)], Mul_2cycle;
+  (* vmla, vmls *)
+  "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long",
+    [Source_n n2; Source_m n2; Source_d n3; Dest n6], Mul;
+  "neon_mla_qqq_8_16",
+    [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
+  "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long",
+    [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
+  "neon_mla_qqq_32_qqd_32_scalar",
+    [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (3, n6)], Mul_4cycle;
+  (* vmul, vqdmulh, vqrdmulh *)
+  (* vmul, vqdmul *)
+  "neon_mul_ddd_16_scalar_32_16_long_scalar",
+    [Source_n n2; Source_m n1; Dest n6], Mul;
+  "neon_mul_qqd_32_scalar",
+    [Source_n n2; Source_m n1; Dest_n_after (3, n6)], Mul_4cycle;
+  (* vmla, vmls *)
+  (* vmla, vmla, vqdmla, vqdmls *)
+  "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar",
+    [Source_n n2; Source_m n1; Source_d n3; Dest n6], Mul;
+
+  (* NEON integer shift instructions.  *)
+  (* vshr/vshl immediate, vshr_narrow, vshl_vmvh, vsli_vsri_ddd *)
+  "neon_shift_1", [Source n1; Dest n3], Shift;
+  (* vqshl, vrshr immediate; vqshr, vqmov, vrshr, vqrshr narrow;
+     vqshl_vrshl_vqrshl_ddd *)
+  "neon_shift_2", [Source n1; Dest n4], Shift;
+  (* vsli, vsri and vshl for qqq *)
+  "neon_shift_3", [Source n1; Dest_n_after (1, n3)], Shift_2cycle;
+  "neon_vshl_ddd", [Source n1; Dest n1], Shift;
+  "neon_vqshl_vrshl_vqrshl_qqq", [Source n1; Dest_n_after (1, n4)],
+    Shift_2cycle;
+  "neon_vsra_vrsra", [Source_m n1; Source_d n3; Dest n6], Shift;
+
+  (* NEON floating-point instructions.  *)
+  (* vadd, vsub, vabd, vmul, vceq, vcge, vcgt, vcage, vcagt, vmax, vmin *)
+  (* vabs, vneg, vceqz, vcgez, vcgtz, vclez, vcltz, vrecpe, vrsqrte, vcvt *)
+  "neon_fp_vadd_ddd_vabs_dd", [Source n2; Dest n5], Fadd;
+  "neon_fp_vadd_qqq_vabs_qq", [Source n2; Dest_n_after (1, n5)],
+    Fadd_2cycle;
+  (* vsum, fvmx, vfmn *)
+  "neon_fp_vsum", [Source n1; Dest n5], Fadd;
+  "neon_fp_vmul_ddd", [Source_n n2; Source_m n1; Dest n5], Fmul;
+  "neon_fp_vmul_qqd", [Source_n n2; Source_m n1; Dest_n_after (1, n5)],
+    Fmul_2cycle;
+  (* vmla, vmls *)
+  "neon_fp_vmla_ddd",
+    [Source_n n2; Source_m n2; Source_d n3; Dest n9], Fmul_then_fadd;
+  "neon_fp_vmla_qqq",
+    [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n9)],
+    Fmul_then_fadd_2;
+  "neon_fp_vmla_ddd_scalar",
+    [Source_n n2; Source_m n1; Source_d n3; Dest n9], Fmul_then_fadd;
+  "neon_fp_vmla_qqq_scalar",
+    [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n9)],
+    Fmul_then_fadd_2;
+  "neon_fp_vrecps_vrsqrts_ddd", [Source n2; Dest n9], Fmul_then_fadd;
+  "neon_fp_vrecps_vrsqrts_qqq", [Source n2; Dest_n_after (1, n9)],
+    Fmul_then_fadd_2;
+
+  (* NEON byte permute instructions.  *)
+  (* vmov; vtrn and vswp for dd; vzip for dd; vuzp for dd; vrev; vext for dd *)
+  "neon_bp_simple", [Source n1; Dest n2], Permute 1;
+  (* vswp for qq; vext for qqq; vtbl with {Dn} or {Dn, Dn1};
+     similarly for vtbx *)
+  "neon_bp_2cycle", [Source n1; Dest_n_after (1, n2)], Permute 2;
+  (* all the rest *)
+  "neon_bp_3cycle", [Source n1; Dest_n_after (2, n2)], Permute 3;
+
+  (* NEON load/store instructions.  *)
+  "neon_ldr", [Dest n1], Ls 1;
+  "neon_str", [Source n1], Ls 1;
+  "neon_vld1_1_2_regs", [Dest_n_after (1, n1)], Ls 2;
+  "neon_vld1_3_4_regs", [Dest_n_after (2, n1)], Ls 3;
+  "neon_vld2_2_regs_vld1_vld2_all_lanes", [Dest_n_after (1, n2)], Ls 2;
+  "neon_vld2_4_regs", [Dest_n_after (2, n2)], Ls 3;
+  "neon_vld3_vld4", [Dest_n_after (3, n2)], Ls 4;
+  "neon_vst1_1_2_regs_vst2_2_regs", [Source n1], Ls 2;
+  "neon_vst1_3_4_regs", [Source n1], Ls 3;
+  "neon_vst2_4_regs_vst3_vst4", [Source n1], Ls 4;
+  "neon_vst3_vst4", [Source n1], Ls 4;
+  "neon_vld1_vld2_lane", [Source n1; Dest_n_after (2, n2)], Ls 3;
+  "neon_vld3_vld4_lane", [Source n1; Dest_n_after (4, n2)], Ls 5;
+  "neon_vst1_vst2_lane", [Source n1], Ls 2;
+  "neon_vst3_vst4_lane", [Source n1], Ls 3;
+  "neon_vld3_vld4_all_lanes", [Dest_n_after (1, n2)], Ls 3;
+
+  (* NEON register transfer instructions.  *)
+  "neon_mcr", [Dest n2], Permute 1;
+  "neon_mcr_2_mcrr", [Dest n2], Permute 2;
+  (* MRC instructions are in the .tpl file.  *)
+]
+
+(* Augment the tuples in the availability table with an extra component
+   that describes the earliest stage where a source operand may be
+   required.  (It is also possible that an entry in the table has no
+   source requirements.)  *)
+let calculate_sources =
+  List.map (fun (name, avail, res) ->
+              let earliest_stage =
+                List.fold_left
+                  (fun cur -> fun info ->
+                     match info with
+                       Source stage
+                     | Source_n stage
+                     | Source_m stage
+                     | Source_d stage ->
+                         (match cur with
+                           None -> Some stage
+                         | Some stage' when stage < stage' -> Some stage
+                         | _ -> cur)
+                     | _ -> cur) None avail
+              in
+                (name, avail, res, earliest_stage))
+
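For instance (taking the stage names n1, n2, ... to stand for the pipeline stage
numbers, which is how the arithmetic in this file treats them), an entry whose
availability list is [Source_m n1; Source_n n2; Dest n6] gains the extra
component Some n1, since N1 is the earliest stage at which any of its source
operands is read, while an entry with no Source* items gains None.  A minimal
standalone sketch of the same fold:

  let earliest stages =
    List.fold_left
      (fun cur stage ->
         match cur with
           None -> Some stage
         | Some stage' when stage < stage' -> Some stage
         | _ -> cur)
      None stages
  let () = assert (earliest [2; 1; 3] = Some 1)
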
+(* Find the stage, if any, at the end of which a group produces a result.  *)
+let find_dest (attr, avail, _, _) =
+  try
+    find_with_result
+      (fun av -> match av with
+                   Dest st -> Some (Some st)
+                 | Dest_n_after (after, st) -> Some (Some (after + st))
+                 | _ -> None) avail
+  with Not_found -> None 
+
+(* Find the worst-case latency between a producer and a consumer.  *)
+let worst_case_latency producer (_, _, _, earliest_required) =
+  let dest = find_dest producer in
+    match earliest_required, dest with
+      None, _ ->
+        (* The consumer doesn't have any source requirements.  *)
+        None
+    | _, None ->
+        (* The producer doesn't produce any results (e.g. a store insn).  *)
+        None
+    | Some consumed, Some produced -> Some (produced - consumed + 1)
+
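So, again treating stages as plain integers, a producer whose result is listed
as Dest n6, feeding a consumer whose earliest source requirement is N2, gets a
worst-case latency of 6 - 2 + 1 = 5 cycles; with Dest_n_after (1, n6) the result
counts as produced at stage 7 (see find_dest above) and the latency becomes 6.
A small numeric sketch:

  let () =
    let latency produced consumed = produced - consumed + 1 in
    assert (latency 6 2 = 5);          (* Dest n6, earliest source at N2 *)
    assert (latency (1 + 6) 2 = 6)     (* Dest_n_after (1, n6), same consumer *)
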
+(* Helper function for below.  *)
+let latency_calc f producer (_, avail, _, _) =
+  try
+    let source_avail = find_with_result f avail in
+      match find_dest producer with
+        None ->
+          (* The producer does not produce a result.  *)
+          Some 0
+      | Some produced ->
+          let latency = produced - source_avail + 1 in
+            (* Latencies below zero are raised to zero since we don't have
+               delay slots.  *)
+            if latency < 0 then Some 0 else Some latency
+  with Not_found -> None
+
+(* Find any Rm latency between a producer and a consumer.  If no
+   Rm source requirement is explicitly specified for the consumer,
+   return "positive infinity".  Also return "positive infinity" if
+   the latency matches the supplied worst-case latency for this
+   producer.  *)
+let get_m_latency producer consumer =
+  match latency_calc (fun av -> match av with Source_m stage -> Some stage
+                                            | _ -> None) producer consumer
+  with None -> [] | Some latency -> [(Guard_only_m, latency)]
+
+(* Likewise for Rn.  *)
+let get_n_latency producer consumer =
+  match latency_calc (fun av -> match av with Source_n stage -> Some stage
+                                            | _ -> None) producer consumer
+  with None -> [] | Some latency -> [(Guard_only_n, latency)]
+
+(* Likewise for Rd.  *)
+let get_d_latency producer consumer =
+  match 
+    latency_calc (fun av -> match av with Source_d stage -> Some stage
+                                        | _ -> None) producer consumer
+  with None -> [] | Some latency -> [(Guard_only_d, latency)]
+
+(* Given a producer and a consumer, work out the latency of the producer
+   to the consumer in each of the four cases (availability information
+   permitting) identified at the top of this file.  Return the
+   consumer, the worst-case unguarded latency and any guarded latencies.  *)
+let calculate_latencies producer consumer =
+  let worst = worst_case_latency producer consumer in
+  let m_latency = get_m_latency producer consumer in
+  let n_latency = get_n_latency producer consumer in
+  let d_latency = get_d_latency producer consumer in
+    (consumer, worst, m_latency @ n_latency @ d_latency)
+
+(* Helper function for below.  *)
+let pick_latency largest worst guards =
+  let guards =
+    match worst with
+      None -> guards
+    | Some worst -> (Guard_none, worst) :: guards
+  in
+  if List.length guards = 0 then None else
+    let total_latency =
+      List.fold_left (fun acc -> fun (_, latency) -> acc + latency) 0 guards
+    in
+    let average_latency = (float_of_int total_latency) /.
+                          (float_of_int (List.length guards)) in
+    let rounded_latency = int_of_float (ceil average_latency) in
+      if rounded_latency = largest then None
+      else Some (Guard_none, rounded_latency)
+
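For example, if the worst-case latency is 5 and the guarded latencies are 3 and
4, the average of {5, 3, 4} is exactly 4, so a (Guard_none, 4) bypass is
proposed; had the rounded-up average equalled the producer's overall largest
latency, None would be returned and no extra bypass emitted.  A sketch of the
rounding (latencies assumed to be plain integers, as elsewhere in this file):

  let () =
    let avg xs =
      int_of_float (ceil (float_of_int (List.fold_left (+) 0 xs)
                          /. float_of_int (List.length xs)))
    in
    assert (avg [5; 3; 4] = 4);
    assert (avg [3; 4] = 4)            (* 3.5 rounds up to 4 *)
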
+(* Collate all bypasses for a particular producer as required in
+   worst_case_latencies_and_bypasses.  (By this stage there is a maximum
+   of one bypass from this producer to any particular consumer listed
+   in LATENCIES.)  Use a hash table to collate bypasses with the
+   same latency and guard.  *)
+let collate_bypasses (producer_name, _, _, _) largest latencies =
+  let ht = Hashtbl.create 42 in
+  let keys = ref [] in
+    List.iter (
+      fun ((consumer, _, _, _), worst, guards) ->
+        (* Find out which latency to use.  Ignoring latencies that match
+           the *overall* worst-case latency for this producer (which will
+           be in define_insn_reservation), we have to examine:
+	   1. the latency with no guard between this producer and this
+              consumer; and
+	   2. any guarded latency.  *)
+        let guard_latency_opt = pick_latency largest worst guards in
+          match guard_latency_opt with
+            None -> ()
+          | Some (guard, latency) ->
+            begin
+              (if (try ignore (Hashtbl.find ht (guard, latency)); false
+                   with Not_found -> true) then
+                 keys := (guard, latency) :: !keys);
+              Hashtbl.add ht (guard, latency) consumer
+            end
+    ) latencies;
+    (* The hash table now has bypasses collated so that ones with the
+       same latency and guard have the same keys.  Walk through all the
+       keys, extract the associated bypasses, and concatenate the names
+       of the consumers for each bypass.  *)
+    List.map (
+      fun ((guard, latency) as key) ->
+        let consumers = Hashtbl.find_all ht key in
+          (producer_name,
+           String.concat ",\\\n               " consumers,
+           latency,
+           guard)
+      ) !keys
+
+(* For every producer, find the worst-case latency between it and
+   *any* consumer.  Also determine (if such a thing exists) the
+   lowest-latency bypass from each producer to each consumer.  Group
+   the output in such a way that all bypasses with the same producer
+   and latency are together, and so that bypasses with the worst-case
+   latency are ignored.  *)
+let worst_case_latencies_and_bypasses =
+  let rec f (worst_acc, bypasses_acc) prev xs =
+    match xs with
+      [] -> (worst_acc, bypasses_acc)
+    | ((producer_name, producer_avail, res_string, _) as producer)::next ->
+      (* For this particular producer, work out the latencies between
+         it and every consumer.  *)
+      let latencies =
+        List.fold_left (fun acc -> fun consumer ->
+                          (calculate_latencies producer consumer) :: acc)
+                       [] (prev @ xs)
+      in
+        (* Now work out what the overall worst case latency was for this
+           particular producer.  *)
+        match latencies with
+          [] -> assert false
+        | _ ->
+          let comp_fn (_, l1, _) (_, l2, _) =
+            if l1 > l2 then -1 else if l1 = l2 then 0 else 1
+          in
+          let largest =
+            match List.hd (List.sort comp_fn latencies) with
+              (_, None, _) -> 0 (* Producer has no consumers. *)
+            | (_, Some worst, _) -> worst
+          in
+          (* Having got the largest latency, collect all bypasses for
+             this producer and filter out those with that larger
+             latency.  Record the others for later emission.  *)
+          let bypasses = collate_bypasses producer largest latencies in
+            (* Go on to process remaining producers, having noted
+               the result for this one.  *)
+            f ((producer_name, producer_avail, largest,
+                res_string) :: worst_acc,
+               bypasses @ bypasses_acc)
+              (prev @ [producer]) next
+  in
+    f ([], []) []
+
+(* Emit a helpful comment for a define_insn_reservation.  *)
+let write_comment producer avail =
+  let seen_source = ref false in
+  let describe info =
+    let read = if !seen_source then "" else "read " in
+    match info with
+      Source stage ->
+        seen_source := true;
+	Printf.printf "%stheir source operands at N%d" read stage
+    | Source_n stage ->
+        seen_source := true;
+	Printf.printf "%stheir (D|Q)n operands at N%d" read stage
+    | Source_m stage ->
+        seen_source := true;
+	Printf.printf "%stheir (D|Q)m operands at N%d" read stage
+    | Source_d stage ->
+	Printf.printf "%stheir (D|Q)d operands at N%d" read stage
+    | Dest stage ->
+	Printf.printf "produce a result at N%d" stage
+    | Dest_n_after (after, stage) ->
+	Printf.printf "produce a result at N%d on cycle %d" stage (after + 1)
+  in
+    Printf.printf ";; Instructions using this reservation ";
+    let rec f infos x =
+      let sep = if x mod 2 = 1 then "" else "\n;;" in
+      match infos with
+        [] -> assert false
+      | [info] -> describe info; Printf.printf ".\n"
+      | info::(_::[] as infos) ->
+          describe info; Printf.printf ", and%s " sep; f infos (x+1)
+      | info::infos -> describe info; Printf.printf ",%s " sep; f infos (x+1)
+    in
+      f avail 0
+
+(* Emit a define_insn_reservation for each producer.  The latency
+   written in will be its worst-case latency.  *)
+let emit_insn_reservations =
+  List.iter (
+     fun (producer, avail, latency, reservation) ->
+        write_comment producer avail;
+        Printf.printf "(define_insn_reservation \"%s\" %d\n" producer latency;
+        Printf.printf "  (and (eq_attr \"tune\" \"cortexa8\")\n";
+        Printf.printf "       (eq_attr \"neon_type\" \"%s\"))\n" producer;
+        let str =
+          match reservation with
+	    Mul -> "dp" | Mul_2cycle -> "dp_2" | Mul_4cycle -> "dp_4"
+	  | Shift -> "dp" | Shift_2cycle -> "dp_2"
+	  | ALU -> "dp" | ALU_2cycle -> "dp_2"
+	  | Fmul -> "dp" | Fmul_2cycle -> "dp_2"
+	  | Fadd -> "fadd" | Fadd_2cycle -> "fadd_2"
+	  | Ls 1 -> "ls"
+          | Ls n -> "ls_" ^ (string_of_int n)
+	  | Permute 1 -> "perm"
+          | Permute n -> "perm_" ^ (string_of_int n)
+	  | Fmul_then_fadd -> "fmul_then_fadd"
+	  | Fmul_then_fadd_2 -> "fmul_then_fadd_2"
+        in
+          Printf.printf "  \"cortex_a8_neon_%s\")\n\n" str
+    )
+
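To illustrate the shape of what gets printed (a sketch only; the latency figure
below is hypothetical, since the real number comes out of
worst_case_latencies_and_bypasses), the "neon_int_1" entry above, with
availability [Source n2; Dest n3] and an ALU reservation, would come out
roughly as:

  ;; Instructions using this reservation read their source operands at N2, and
  ;; produce a result at N3.
  (define_insn_reservation "neon_int_1" 3
    (and (eq_attr "tune" "cortexa8")
         (eq_attr "neon_type" "neon_int_1"))
    "cortex_a8_neon_dp")
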
+(* Given a guard description, return the name of the C function to
+   be used as the guard for define_bypass.  *)
+let guard_fn g =
+  match g with
+    Guard_only_m -> "arm_neon_only_m_dependency"
+  | Guard_only_n -> "arm_neon_only_n_dependency"
+  | Guard_only_d -> "arm_neon_only_d_dependency"
+  | Guard_none -> assert false
+
+(* Emit a define_bypass for each bypass.  *)
+let emit_bypasses =
+  List.iter (
+      fun (producer, consumers, latency, guard) ->
+        Printf.printf "(define_bypass %d \"%s\"\n" latency producer;
+        if guard = Guard_none then
+          Printf.printf "               \"%s\")\n\n" consumers
+        else
+          begin
+            Printf.printf "               \"%s\"\n" consumers;
+            Printf.printf "               \"%s\")\n\n" (guard_fn guard)
+          end
+    )
+
+(* Program entry point.  *)
+let main =
+  let table = calculate_sources availability_table in
+  let worst_cases, bypasses = worst_case_latencies_and_bypasses table in
+    emit_insn_reservations (List.rev worst_cases);
+    Printf.printf ";; Exceptions to the default latencies.\n\n";
+    emit_bypasses bypasses
+

Added: llvm-gcc-4.2/trunk/gcc/config/arm/neon-testgen.ml
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/neon-testgen.ml?rev=76781&view=auto

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/neon-testgen.ml (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/neon-testgen.ml Wed Jul 22 15:36:27 2009
@@ -0,0 +1,274 @@
+(* APPLE LOCAL file v7 support. Merge from Codesourcery *)
+(* Auto-generate ARM Neon intrinsics tests.
+   Copyright (C) 2006 Free Software Foundation, Inc.
+   Contributed by CodeSourcery.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it under
+   the terms of the GNU General Public License as published by the Free
+   Software Foundation; either version 2, or (at your option) any later
+   version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or
+   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+   for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING.  If not, write to the Free
+   Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+   02110-1301, USA.
+
+   This is an O'Caml program.  The O'Caml compiler is available from:
+
+     http://caml.inria.fr/
+
+   Or from your favourite OS's friendly packaging system. Tested with version
+   3.09.2, though other versions will probably work too.
+  
+   Compile with:
+     ocamlc -c neon.ml
+     ocamlc -o neon-testgen neon.cmo neon-testgen.ml
+*)
+
+open Neon
+
+type c_type_flags = Pointer | Const
+
+(* Open a test source file.  *)
+let open_test_file dir name =
+  try
+    open_out (dir ^ "/" ^ name ^ ".c")
+  with Sys_error str ->
+    failwith ("Could not create test source file " ^ name ^ ": " ^ str)
+
+(* Emit prologue code to a test source file.  *)
+let emit_prologue chan test_name =
+  Printf.fprintf chan "/* Test the `%s' ARM Neon intrinsic.  */\n" test_name;
+  Printf.fprintf chan "/* This file was autogenerated by neon-testgen.  */\n\n";
+  Printf.fprintf chan "/* { dg-do assemble } */\n";
+  Printf.fprintf chan "/* { dg-require-effective-target arm_neon_ok } */\n";
+  Printf.fprintf chan
+                 "/* { dg-options \"-save-temps -O0 -mfpu=neon -mfloat-abi=softfp\" } */\n";
+  Printf.fprintf chan "\n#include \"arm_neon.h\"\n\n";
+  Printf.fprintf chan "void test_%s (void)\n{\n" test_name
+
+(* Emit declarations of local variables that are going to be passed
+   to an intrinsic, together with one to take a returned value if needed.  *)
+let emit_automatics chan c_types =
+  let emit () =
+    ignore (
+      List.fold_left (fun arg_number -> fun (flags, ty) ->
+                        let pointer_bit =
+                          if List.mem Pointer flags then "*" else ""
+                        in
+                          (* Const arguments to builtins are directly
+                             written in as constants.  *)
+                          if not (List.mem Const flags) then
+                            Printf.fprintf chan "  %s %sarg%d_%s;\n"
+                                           ty pointer_bit arg_number ty;
+                        arg_number + 1)
+                     0 (List.tl c_types))
+  in
+    match c_types with
+      (_, return_ty) :: tys ->
+        if return_ty <> "void" then
+          (* The intrinsic returns a value.  *)
+          (Printf.fprintf chan "  %s out_%s;\n" return_ty return_ty;
+           emit ())
+        else
+          (* The intrinsic does not return a value.  *)
+          emit ()
+    | _ -> assert false
+
+(* Emit code to call an intrinsic.  *)
+let emit_call chan const_valuator c_types name elt_ty =
+  (if snd (List.hd c_types) <> "void" then
+     Printf.fprintf chan "  out_%s = " (snd (List.hd c_types))
+   else
+     Printf.fprintf chan "  ");
+  Printf.fprintf chan "%s_%s (" (intrinsic_name name) (string_of_elt elt_ty);
+  let print_arg chan arg_number (flags, ty) =
+    (* If the argument is of const type, then directly write in the
+       constant now.  *)
+    if List.mem Const flags then
+      match const_valuator with
+        None ->
+          if List.mem Pointer flags then
+            Printf.fprintf chan "0"
+          else
+            Printf.fprintf chan "1"
+      | Some f -> Printf.fprintf chan "%s" (string_of_int (f arg_number))
+    else
+      Printf.fprintf chan "arg%d_%s" arg_number ty
+  in
+  let rec print_args arg_number tys =
+    match tys with
+      [] -> ()
+    | [ty] -> print_arg chan arg_number ty
+    | ty::tys ->
+      print_arg chan arg_number ty;
+      Printf.fprintf chan ", ";
+      print_args (arg_number + 1) tys
+  in
+    print_args 0 (List.tl c_types);
+    Printf.fprintf chan ");\n"
+
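Putting emit_prologue, emit_automatics and emit_call together, the body of a
generated test for a hypothetical two-operand intrinsic returning int8x8_t
(say vadd_s8) would look roughly like this; a sketch of the shape only, not a
verbatim generated file:

  int8x8_t out_int8x8_t;
  int8x8_t arg0_int8x8_t;
  int8x8_t arg1_int8x8_t;

  out_int8x8_t = vadd_s8 (arg0_int8x8_t, arg1_int8x8_t);
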
+(* Emit epilogue code to a test source file.  *)
+let emit_epilogue chan features regexps =
+  let no_op = List.exists (fun feature -> feature = No_op) features in
+    Printf.fprintf chan "}\n\n";
+    (if not no_op then
+       List.iter (fun regexp ->
+                   Printf.fprintf chan
+                     "/* { dg-final { scan-assembler \"%s\" } } */\n" regexp)
+                regexps
+     else
+       ()
+    );
+    Printf.fprintf chan "/* { dg-final { cleanup-saved-temps } } */\n"
+
+(* Check a list of C types to determine which ones are pointers and which
+   ones are const.  *)
+let check_types tys =
+  let tys' =
+    List.map (fun ty ->
+                let len = String.length ty in
+                  if len > 2 && String.get ty (len - 2) = ' '
+                             && String.get ty (len - 1) = '*'
+                  then ([Pointer], String.sub ty 0 (len - 2))
+                  else ([], ty)) tys
+  in
+    List.map (fun (flags, ty) ->
+                if String.length ty > 6 && String.sub ty 0 6 = "const "
+                then (Const :: flags, String.sub ty 6 ((String.length ty) - 6))
+                else (flags, ty)) tys'
+
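As a worked example of this classification, the string "const int8_t *" first
loses its trailing " *" and picks up Pointer, then loses its leading "const "
and picks up Const, ending as ([Const; Pointer], "int8_t").  A standalone
sketch of the same two steps (re-declaring the flags locally so it runs on its
own):

  type flag = Pointer | Const
  let classify ty =
    let flags, ty =
      let len = String.length ty in
      if len > 2 && ty.[len - 2] = ' ' && ty.[len - 1] = '*'
      then ([Pointer], String.sub ty 0 (len - 2))
      else ([], ty)
    in
    if String.length ty > 6 && String.sub ty 0 6 = "const "
    then (Const :: flags, String.sub ty 6 (String.length ty - 6))
    else (flags, ty)
  let () = assert (classify "const int8_t *" = ([Const; Pointer], "int8_t"))
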
+(* Given an intrinsic shape, produce a regexp that will match
+   the right-hand sides of instructions generated by an intrinsic of
+   that shape.  *)
+let rec analyze_shape shape =
+  let rec n_things n thing =
+    match n with
+      0 -> []
+    | n -> thing :: (n_things (n - 1) thing)
+  in
+  let rec analyze_shape_elt elt =
+    match elt with
+      Dreg -> "\\[dD\\]\\[0-9\\]+"
+    | Qreg -> "\\[qQ\\]\\[0-9\\]+"
+    | Corereg -> "\\[rR\\]\\[0-9\\]+"
+    | Immed -> "#\\[0-9\\]+"
+    | VecArray (1, elt) ->
+        let elt_regexp = analyze_shape_elt elt in
+          "((\\\\\\{" ^ elt_regexp ^ "\\\\\\})|(" ^ elt_regexp ^ "))"
+    | VecArray (n, elt) ->
+      let elt_regexp = analyze_shape_elt elt in
+      let alt1 = elt_regexp ^ "-" ^ elt_regexp in
+      let alt2 = commas (fun x -> x) (n_things n elt_regexp) "" in
+        "\\\\\\{((" ^ alt1 ^ ")|(" ^ alt2 ^ "))\\\\\\}"
+    | (PtrTo elt | CstPtrTo elt) ->
+      "\\\\\\[" ^ (analyze_shape_elt elt) ^ "\\\\\\]"
+    | Element_of_dreg -> (analyze_shape_elt Dreg) ^ "\\\\\\[\\[0-9\\]+\\\\\\]"
+    | Element_of_qreg -> (analyze_shape_elt Qreg) ^ "\\\\\\[\\[0-9\\]+\\\\\\]"
+    | All_elements_of_dreg -> (analyze_shape_elt Dreg) ^ "\\\\\\[\\\\\\]"
+  in
+    match shape with
+      All (n, elt) -> commas analyze_shape_elt (n_things n elt) ""
+    | Long -> (analyze_shape_elt Qreg) ^ ", " ^ (analyze_shape_elt Dreg) ^
+              ", " ^ (analyze_shape_elt Dreg)
+    | Long_noreg elt -> (analyze_shape_elt elt) ^ ", " ^ (analyze_shape_elt elt)
+    | Wide -> (analyze_shape_elt Qreg) ^ ", " ^ (analyze_shape_elt Qreg) ^
+              ", " ^ (analyze_shape_elt Dreg)
+    | Wide_noreg elt -> analyze_shape (Long_noreg elt)
+    | Narrow -> (analyze_shape_elt Dreg) ^ ", " ^ (analyze_shape_elt Qreg) ^
+                ", " ^ (analyze_shape_elt Qreg)
+    | Use_operands elts -> commas analyze_shape_elt (Array.to_list elts) ""
+    | By_scalar Dreg ->
+        analyze_shape (Use_operands [| Dreg; Dreg; Element_of_dreg |])
+    | By_scalar Qreg ->
+        analyze_shape (Use_operands [| Qreg; Qreg; Element_of_dreg |])
+    | By_scalar _ -> assert false
+    | Wide_lane ->
+        analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |])
+    | Wide_scalar ->
+        analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |])
+    | Pair_result elt ->
+      let elt_regexp = analyze_shape_elt elt in
+        elt_regexp ^ ", " ^ elt_regexp
+    | Unary_scalar _ -> "FIXME Unary_scalar"
+    | Binary_imm elt -> analyze_shape (Use_operands [| elt; elt; Immed |])
+    | Narrow_imm -> analyze_shape (Use_operands [| Dreg; Qreg; Immed |])
+    | Long_imm -> analyze_shape (Use_operands [| Qreg; Dreg; Immed |])
+
+(* Generate tests for one intrinsic.  *)
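Note that the doubled backslashes in these strings are just OCaml escaping for
the Tcl-level patterns that end up inside the dg-final directives; the Dreg
case above, for instance, denotes the Tcl regexp \[dD\]\[0-9\]+, which
scan-assembler will match against a D-register operand such as d5.  A one-line
sketch of the escaping:

  let () = print_endline "\\[dD\\]\\[0-9\\]+"   (* prints \[dD\]\[0-9\]+ *)
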
+let test_intrinsic dir opcode features shape name munge elt_ty =
+  (* Open the test source file.  *)
+  let test_name = name ^ (string_of_elt elt_ty) in
+  let chan = open_test_file dir test_name in
+  (* Work out what argument and return types the intrinsic has.  *)
+  let c_arity, new_elt_ty = munge shape elt_ty in
+  let c_types = check_types (strings_of_arity c_arity) in
+  (* Extract any constant valuator (a function specifying what constant
+     values are to be written into the intrinsic call) from the features
+     list.  *)
+  let const_valuator =
+    try
+      match (List.find (fun feature -> match feature with
+                                         Const_valuator _ -> true
+				       | _ -> false) features) with
+        Const_valuator f -> Some f
+      | _ -> assert false
+    with Not_found -> None
+  in
+  (* Work out what instruction name(s) to expect.  *)
+  let insns = get_insn_names features name in
+  let no_suffix = (new_elt_ty = NoElts) in
+  let insns =
+    if no_suffix then insns
+                 else List.map (fun insn ->
+                                  let suffix = string_of_elt_dots new_elt_ty in
+                                    insn ^ "\\." ^ suffix) insns
+  in
+  (* Construct a regexp to match against the expected instruction name(s).  *)
+  let insn_regexp =
+    match insns with
+      [] -> assert false
+    | [insn] -> insn
+    | _ ->
+      let rec calc_regexp insns cur_regexp =
+        match insns with
+          [] -> cur_regexp
+        | [insn] -> cur_regexp ^ "(" ^ insn ^ "))"
+        | insn::insns -> calc_regexp insns (cur_regexp ^ "(" ^ insn ^ ")|")
+      in calc_regexp insns "("
+  in
+  (* Construct regexps to match against the instructions that this
+     intrinsic expands to.  Watch out for any writeback character and
+     comments after the instruction.  *)
+  let regexps = List.map (fun regexp -> insn_regexp ^ "\\[ \t\\]+" ^ regexp ^
+			  "!?\\(\\[ \t\\]+@\\[a-zA-Z0-9 \\]+\\)?\\n")
+                         (analyze_all_shapes features shape analyze_shape)
+  in
+    (* Emit file and function prologues.  *)
+    emit_prologue chan test_name;
+    (* Emit local variable declarations.  *)
+    emit_automatics chan c_types;
+    Printf.fprintf chan "\n";
+    (* Emit the call to the intrinsic.  *)
+    emit_call chan const_valuator c_types name elt_ty;
+    (* Emit the function epilogue and the DejaGNU scan-assembler directives.  *)
+    emit_epilogue chan features regexps;
+    (* Close the test file.  *)
+    close_out chan
+
+(* Generate tests for one element of the "ops" table.  *)
+let test_intrinsic_group dir (opcode, features, shape, name, munge, types) =
+  List.iter (test_intrinsic dir opcode features shape name munge) types
+
+(* Program entry point.  *)
+let _ =
+  let directory = if Array.length Sys.argv <> 1 then Sys.argv.(1) else "." in
+    List.iter (test_intrinsic_group directory) (reinterp @ ops)
+

Added: llvm-gcc-4.2/trunk/gcc/config/arm/neon.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/neon.md?rev=76781&view=auto

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/neon.md (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/neon.md Wed Jul 22 15:36:27 2009
@@ -0,0 +1,4917 @@
+;; APPLE LOCAL file v7 support. Merge from Codesourcery
+;; ARM NEON coprocessor Machine Description
+;; Copyright (C) 2006 Free Software Foundation, Inc.
+;; Written by CodeSourcery.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to the Free
+;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+;; 02110-1301, USA.
+
+;; Constants for unspecs.
+(define_constants
+  [(UNSPEC_VPADD 65)
+   (UNSPEC_VPSMIN 66)
+   (UNSPEC_VPUMIN 67)
+   (UNSPEC_VPSMAX 68)
+   (UNSPEC_VPUMAX 69)
+   (UNSPEC_ASHIFT_SIGNED 70)
+   (UNSPEC_ASHIFT_UNSIGNED 71)
+   (UNSPEC_VADD 72)
+   (UNSPEC_VADDL 73)
+   (UNSPEC_VADDW 74)
+   (UNSPEC_VHADD 75)
+   (UNSPEC_VQADD 76)
+   (UNSPEC_VADDHN 77)
+   (UNSPEC_VABS 78)
+   (UNSPEC_VQABS 79)
+   (UNSPEC_VGET_LANE 80)
+   (UNSPEC_VSET_LANE 81)
+   (UNSPEC_VDUP_N 82)
+   (UNSPEC_VCOMBINE 83)
+   (UNSPEC_VGET_HIGH 84)
+   (UNSPEC_VGET_LOW 85)
+   (UNSPEC_VMOVN 87)
+   (UNSPEC_VQMOVN 88)
+   (UNSPEC_VQMOVUN 89)
+   (UNSPEC_VMOVL 90)
+   (UNSPEC_VMUL_LANE 91)
+   (UNSPEC_VMLA_LANE 92)
+   (UNSPEC_VMLAL_LANE 93)
+   (UNSPEC_VQDMLAL_LANE 94)
+   (UNSPEC_VMUL_N 95)
+   (UNSPEC_VCVT 96)
+   (UNSPEC_VEXT 97)
+   (UNSPEC_VREV64 98)
+   (UNSPEC_VREV32 99)
+   (UNSPEC_VREV16 100)
+   (UNSPEC_VBSL 101)
+   (UNSPEC_VLD1 102)
+   (UNSPEC_VLD1_LANE 103)
+   (UNSPEC_VLD1_DUP 104)
+   (UNSPEC_VST1 105)
+   (UNSPEC_VST1_LANE 106)
+   (UNSPEC_VSTRUCTDUMMY 107)
+   (UNSPEC_VLD2 108)
+   (UNSPEC_VLD2_LANE 109)
+   (UNSPEC_VLD2_DUP 110)
+   (UNSPEC_VST2 111)
+   (UNSPEC_VST2_LANE 112)
+   (UNSPEC_VLD3 113)
+   (UNSPEC_VLD3A 114)
+   (UNSPEC_VLD3B 115)
+   (UNSPEC_VLD3_LANE 116)
+   (UNSPEC_VLD3_DUP 117)
+   (UNSPEC_VST3 118)
+   (UNSPEC_VST3A 119)
+   (UNSPEC_VST3B 120)
+   (UNSPEC_VST3_LANE 121)
+   (UNSPEC_VLD4 122)
+   (UNSPEC_VLD4A 123)
+   (UNSPEC_VLD4B 124)
+   (UNSPEC_VLD4_LANE 125)
+   (UNSPEC_VLD4_DUP 126)
+   (UNSPEC_VST4 127)
+   (UNSPEC_VST4A 128)
+   (UNSPEC_VST4B 129)
+   (UNSPEC_VST4_LANE 130)
+   (UNSPEC_VTRN1 131)
+   (UNSPEC_VTRN2 132)
+   (UNSPEC_VTBL 133)
+   (UNSPEC_VTBX 134)
+   (UNSPEC_VAND 135)
+   (UNSPEC_VORR 136)
+   (UNSPEC_VEOR 137)
+   (UNSPEC_VBIC 138)
+   (UNSPEC_VORN 139)
+   (UNSPEC_VCVT_N 140)
+   (UNSPEC_VQNEG 142)
+   (UNSPEC_VMVN 143)
+   (UNSPEC_VCLS 144)
+   (UNSPEC_VCLZ 145)
+   (UNSPEC_VCNT 146)
+   (UNSPEC_VRECPE 147)
+   (UNSPEC_VRSQRTE 148)
+   (UNSPEC_VMUL 149)
+   (UNSPEC_VMLA 150)
+   (UNSPEC_VMLAL 151)
+   (UNSPEC_VMLS 152)
+   (UNSPEC_VMLSL 153)
+   (UNSPEC_VQDMULH 154)
+   (UNSPEC_VQDMLAL 155)
+   (UNSPEC_VQDMLSL 156)
+   (UNSPEC_VMULL 157)
+   (UNSPEC_VQDMULL 158)
+   (UNSPEC_VMLS_LANE 159)
+   (UNSPEC_VMLSL_LANE 160)
+   (UNSPEC_VQDMLSL_LANE 161)
+   (UNSPEC_VDUP_LANE 162)
+   (UNSPEC_VZIP1 163)
+   (UNSPEC_VZIP2 164)
+   (UNSPEC_VUZP1 165)
+   (UNSPEC_VUZP2 166)
+   (UNSPEC_VSRI 167)
+   (UNSPEC_VSLI 168)
+   (UNSPEC_VSRA_N 169)
+   (UNSPEC_VSHL_N 170)
+   (UNSPEC_VQSHL_N 171)
+   (UNSPEC_VQSHLU_N 172)
+   (UNSPEC_VSHLL_N 173)
+   (UNSPEC_VSHR_N 174)
+   (UNSPEC_VSHRN_N 175)
+   (UNSPEC_VQSHRN_N 176)
+   (UNSPEC_VQSHRUN_N 177)
+   (UNSPEC_VSUB 178)
+   (UNSPEC_VSUBL 179)
+   (UNSPEC_VSUBW 180)
+   (UNSPEC_VQSUB 181)
+   (UNSPEC_VHSUB 182)
+   (UNSPEC_VSUBHN 183)
+   (UNSPEC_VCEQ 184)
+   (UNSPEC_VCGE 185)
+   (UNSPEC_VCGT 186)
+   (UNSPEC_VCAGE 187)
+   (UNSPEC_VCAGT 188)
+   (UNSPEC_VTST 189)
+   (UNSPEC_VABD 190)
+   (UNSPEC_VABDL 191)
+   (UNSPEC_VABA 192)
+   (UNSPEC_VABAL 193)
+   (UNSPEC_VMAX 194)
+   (UNSPEC_VMIN 195)
+   (UNSPEC_VPADDL 196)
+   (UNSPEC_VPADAL 197)
+   (UNSPEC_VSHL 198)
+   (UNSPEC_VQSHL 199)
+   (UNSPEC_VPMAX 200)
+   (UNSPEC_VPMIN 201)
+   (UNSPEC_VRECPS 202)
+   (UNSPEC_VRSQRTS 203)
+   (UNSPEC_VMULL_LANE 204)
+   (UNSPEC_VQDMULL_LANE 205)
+   (UNSPEC_VQDMULH_LANE 206)])
+
+
+;; Double-width vector modes.
+(define_mode_macro VD [V8QI V4HI V2SI V2SF])
+
+;; Double-width vector modes plus 64-bit elements.
+(define_mode_macro VDX [V8QI V4HI V2SI V2SF DI])
+
+;; Same, without floating-point elements.
+(define_mode_macro VDI [V8QI V4HI V2SI])
+
+;; Quad-width vector modes.
+(define_mode_macro VQ [V16QI V8HI V4SI V4SF])
+
+;; Quad-width vector modes plus 64-bit elements.
+(define_mode_macro VQX [V16QI V8HI V4SI V4SF V2DI])
+
+;; Same, without floating-point elements.
+(define_mode_macro VQI [V16QI V8HI V4SI])
+
+;; Same, with TImode added, for moves.
+(define_mode_macro VQXMOV [V16QI V8HI V4SI V4SF V2DI TI])
+
+;; Opaque structure types wider than TImode.
+(define_mode_macro VSTRUCT [EI OI CI XI])
+
+;; Number of instructions needed to load/store struct elements. FIXME!
+(define_mode_attr V_slen [(EI "2") (OI "2") (CI "3") (XI "4")])
+
+;; Opaque structure types used in table lookups (except vtbl1/vtbx1).
+(define_mode_macro VTAB [TI EI OI])
+
+;; vtbl<n> suffix for above modes.
+(define_mode_attr VTAB_n [(TI "2") (EI "3") (OI "4")])
+
+;; Widenable modes.
+(define_mode_macro VW [V8QI V4HI V2SI])
+
+;; Narrowable modes.
+(define_mode_macro VN [V8HI V4SI V2DI])
+
+;; All supported vector modes (except singleton DImode).
+(define_mode_macro VDQ [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DI])
+
+;; All supported vector modes (except those with 64-bit integer elements).
+(define_mode_macro VDQW [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF])
+
+;; Supported integer vector modes (not 64 bit elements).
+(define_mode_macro VDQIW [V8QI V16QI V4HI V8HI V2SI V4SI])
+
+;; Supported integer vector modes (not singleton DI)
+(define_mode_macro VDQI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI])
+
+;; Vector modes, including 64-bit integer elements.
+(define_mode_macro VDQX [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF DI V2DI])
+
+;; Vector modes including 64-bit integer elements, but no floats.
+(define_mode_macro VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI])
+
+;; Vector modes for float->int conversions.
+(define_mode_macro VCVTF [V2SF V4SF])
+
+;; Vector modes for int->float conversions.
+(define_mode_macro VCVTI [V2SI V4SI])
+
+;; Vector modes for doubleword multiply-accumulate, etc. insns.
+(define_mode_macro VMD [V4HI V2SI V2SF])
+
+;; Vector modes for quadword multiply-accumulate, etc. insns.
+(define_mode_macro VMQ [V8HI V4SI V4SF])
+
+;; Above modes combined.
+(define_mode_macro VMDQ [V4HI V2SI V2SF V8HI V4SI V4SF])
+
+;; As VMD, but integer modes only.
+(define_mode_macro VMDI [V4HI V2SI])
+
+;; As VMQ, but integer modes only.
+(define_mode_macro VMQI [V8HI V4SI])
+
+;; Above modes combined.
+(define_mode_macro VMDQI [V4HI V2SI V8HI V4SI])
+
+;; Modes with 8-bit and 16-bit elements.
+(define_mode_macro VX [V8QI V4HI V16QI V8HI])
+
+;; Modes with 8-bit elements.
+(define_mode_macro VE [V8QI V16QI])
+
+;; Modes with 64-bit elements only.
+(define_mode_macro V64 [DI V2DI])
+
+;; Modes with 32-bit elements only.
+(define_mode_macro V32 [V2SI V2SF V4SI V4SF])
+
+;; (Opposite) mode to convert to/from for above conversions.
+(define_mode_attr V_CVTTO [(V2SI "V2SF") (V2SF "V2SI")
+			   (V4SI "V4SF") (V4SF "V4SI")])
+
+;; Define element mode for each vector mode.
+(define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
+			  (V4HI "HI") (V8HI "HI")
+                          (V2SI "SI") (V4SI "SI")
+                          (V2SF "SF") (V4SF "SF")
+                          (DI "DI")   (V2DI "DI")])
+
+;; Mode of pair of elements for each vector mode, to define transfer
+;; size for structure lane/dup loads and stores.
+(define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI")
+			      (V4HI "SI") (V8HI "SI")
+                              (V2SI "V2SI") (V4SI "V2SI")
+                              (V2SF "V2SF") (V4SF "V2SF")
+                              (DI "V2DI")   (V2DI "V2DI")])
+
+;; Similar, for three elements.
+;; ??? Should we define extra modes so that sizes of all three-element
+;; accesses can be accurately represented?
+(define_mode_attr V_three_elem [(V8QI "SI")   (V16QI "SI")
+			        (V4HI "V4HI") (V8HI "V4HI")
+                                (V2SI "V4SI") (V4SI "V4SI")
+                                (V2SF "V4SF") (V4SF "V4SF")
+                                (DI "EI")     (V2DI "EI")])
+
+;; Similar, for four elements.
+(define_mode_attr V_four_elem [(V8QI "SI")   (V16QI "SI")
+			       (V4HI "V4HI") (V8HI "V4HI")
+                               (V2SI "V4SI") (V4SI "V4SI")
+                               (V2SF "V4SF") (V4SF "V4SF")
+                               (DI "OI")     (V2DI "OI")])
+
+;; Register width from element mode
+(define_mode_attr V_reg [(V8QI "P") (V16QI "q")
+                         (V4HI "P") (V8HI  "q")
+                         (V2SI "P") (V4SI  "q")
+                         (V2SF "P") (V4SF  "q")
+                         (DI   "P") (V2DI  "q")])
+
+;; Wider modes with the same number of elements.
+(define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")])
+
+;; Narrower modes with the same number of elements.
+(define_mode_attr V_narrow [(V8HI "V8QI") (V4SI "V4HI") (V2DI "V2SI")])
+
+;; Modes with half the number of equal-sized elements.
+(define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
+			  (V4SI  "V2SI") (V4SF "V2SF")
+                          (V2DI "DI")])
+
+;; Same, but lower-case.
+(define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
+			  (V4SI  "v2si") (V4SF "v2sf")
+                          (V2DI "di")])
+
+;; Modes with twice the number of equal-sized elements.
+(define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI")
+			    (V2SI "V4SI") (V2SF "V4SF")
+                            (DI "V2DI")])
+
+;; Same, but lower-case.
+(define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi")
+			    (V2SI "v4si") (V2SF "v4sf")
+                            (DI "v2di")])
+
+;; Modes with double-width elements.
+(define_mode_attr V_double_width [(V8QI "V4HI") (V16QI "V8HI")
+				  (V4HI "V2SI") (V8HI "V4SI")
+				  (V2SI "DI")   (V4SI "V2DI")])
+
+;; Mode of result of comparison operations (and bit-select operand 1).
+(define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
+			        (V4HI "V4HI") (V8HI  "V8HI")
+                                (V2SI "V2SI") (V4SI  "V4SI")
+                                (V2SF "V2SI") (V4SF  "V4SI")
+                                (DI   "DI")   (V2DI  "V2DI")])
+
+;; Get element type from double-width mode, for operations where we don't care
+;; about signedness.
+(define_mode_attr V_if_elem [(V8QI "i8")  (V16QI "i8")
+			     (V4HI "i16") (V8HI  "i16")
+                             (V2SI "i32") (V4SI  "i32")
+                             (DI   "i64") (V2DI  "i64")
+			     (V2SF "f32") (V4SF  "f32")])
+
+;; Same, but for operations which work on signed values.
+(define_mode_attr V_s_elem [(V8QI "s8")  (V16QI "s8")
+			    (V4HI "s16") (V8HI  "s16")
+                            (V2SI "s32") (V4SI  "s32")
+                            (DI   "s64") (V2DI  "s64")
+			    (V2SF "f32") (V4SF  "f32")])
+
+;; Same, but for operations which work on unsigned values.
+(define_mode_attr V_u_elem [(V8QI "u8")  (V16QI "u8")
+			    (V4HI "u16") (V8HI  "u16")
+                            (V2SI "u32") (V4SI  "u32")
+                            (DI   "u64") (V2DI  "u64")
+                            (V2SF "f32") (V4SF  "f32")])
+
+;; Element types for extraction of unsigned scalars.
+(define_mode_attr V_uf_sclr [(V8QI "u8")  (V16QI "u8")
+			     (V4HI "u16") (V8HI "u16")
+                             (V2SI "32") (V4SI "32")
+                             (V2SF "32") (V4SF "32")])
+
+(define_mode_attr V_sz_elem [(V8QI "8")  (V16QI "8")
+			     (V4HI "16") (V8HI  "16")
+                             (V2SI "32") (V4SI  "32")
+                             (DI   "64") (V2DI  "64")
+			     (V2SF "32") (V4SF  "32")])
+
+;; Element sizes for duplicating ARM registers to all elements of a vector.
+(define_mode_attr VD_dup [(V8QI "8") (V4HI "16") (V2SI "32") (V2SF "32")])
+
+;; Opaque integer types for results of pair-forming intrinsics (vtrn, etc.)
+(define_mode_attr V_PAIR [(V8QI "TI") (V16QI "OI")
+			  (V4HI "TI") (V8HI  "OI")
+                          (V2SI "TI") (V4SI  "OI")
+                          (V2SF "TI") (V4SF  "OI")
+                          (DI   "TI") (V2DI  "OI")])
+
+;; Same, but lower-case.
+(define_mode_attr V_pair [(V8QI "ti") (V16QI "oi")
+			  (V4HI "ti") (V8HI  "oi")
+                          (V2SI "ti") (V4SI  "oi")
+                          (V2SF "ti") (V4SF  "oi")
+                          (DI   "ti") (V2DI  "oi")])
+
+;; Operations on two halves of a quadword vector.
+(define_code_macro vqh_ops [plus smin smax umin umax])
+
+;; Same, without unsigned variants (for use with *SFmode pattern).
+(define_code_macro vqhs_ops [plus smin smax])
+
+;; Assembler mnemonics for above codes.
+(define_code_attr VQH_mnem [(plus "vadd") (smin "vmin") (smax "vmax")
+			    (umin "vmin") (umax "vmax")])
+
+;; Signs of above, where relevant.
+(define_code_attr VQH_sign [(plus "i") (smin "s") (smax "s") (umin "u")
+			    (umax "u")])
+
+;; Extra suffix on some 64-bit insn names (to avoid collision with standard
+;; names which we don't want to define).
+(define_mode_attr V_suf64 [(V8QI "") (V16QI "")
+			   (V4HI "") (V8HI "")
+                           (V2SI "") (V4SI "")
+                           (V2SF "") (V4SF "")
+                           (DI "_neon") (V2DI "")])
+
+;; Scalars to be presented to scalar multiplication instructions
+;; must satisfy the following constraints.
+;; 1. If the mode specifies 16-bit elements, the scalar must be in D0-D7.
+;; 2. If the mode specifies 32-bit elements, the scalar must be in D0-D15.
+;; This mode attribute is used to obtain the correct register constraints.
+(define_mode_attr scalar_mul_constraint [(V4HI "x") (V2SI "t") (V2SF "t")
+                                         (V8HI "x") (V4SI "t") (V4SF "t")])
+
+;; Attribute used to permit string comparisons against <VQH_mnem> in
+;; neon_type attribute definitions.
+(define_attr "vqh_mnem" "vadd,vmin,vmax" (const_string "vadd"))
+
+;; Classification of NEON instructions for scheduling purposes.
+;; Do not set this attribute and the "type" attribute together in
+;; any one instruction pattern.
+(define_attr "neon_type"
+   "neon_int_1,\
+   neon_int_2,\
+   neon_int_3,\
+   neon_int_4,\
+   neon_int_5,\
+   neon_vqneg_vqabs,\
+   neon_vmov,\
+   neon_vaba,\
+   neon_vsma,\
+   neon_vaba_qqq,\
+   neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+   neon_mul_qqq_8_16_32_ddd_32,\
+   neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar,\
+   neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+   neon_mla_qqq_8_16,\
+   neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long,\
+   neon_mla_qqq_32_qqd_32_scalar,\
+   neon_mul_ddd_16_scalar_32_16_long_scalar,\
+   neon_mul_qqd_32_scalar,\
+   neon_mla_ddd_16_scalar_qdd_32_16_long_scalar,\
+   neon_shift_1,\
+   neon_shift_2,\
+   neon_shift_3,\
+   neon_vshl_ddd,\
+   neon_vqshl_vrshl_vqrshl_qqq,\
+   neon_vsra_vrsra,\
+   neon_fp_vadd_ddd_vabs_dd,\
+   neon_fp_vadd_qqq_vabs_qq,\
+   neon_fp_vsum,\
+   neon_fp_vmul_ddd,\
+   neon_fp_vmul_qqd,\
+   neon_fp_vmla_ddd,\
+   neon_fp_vmla_qqq,\
+   neon_fp_vmla_ddd_scalar,\
+   neon_fp_vmla_qqq_scalar,\
+   neon_fp_vrecps_vrsqrts_ddd,\
+   neon_fp_vrecps_vrsqrts_qqq,\
+   neon_bp_simple,\
+   neon_bp_2cycle,\
+   neon_bp_3cycle,\
+   neon_ldr,\
+   neon_str,\
+   neon_vld1_1_2_regs,\
+   neon_vld1_3_4_regs,\
+   neon_vld2_2_regs_vld1_vld2_all_lanes,\
+   neon_vld2_4_regs,\
+   neon_vld3_vld4,\
+   neon_vst1_1_2_regs_vst2_2_regs,\
+   neon_vst1_3_4_regs,\
+   neon_vst2_4_regs_vst3_vst4,\
+   neon_vst3_vst4,\
+   neon_vld1_vld2_lane,\
+   neon_vld3_vld4_lane,\
+   neon_vst1_vst2_lane,\
+   neon_vst3_vst4_lane,\
+   neon_vld3_vld4_all_lanes,\
+   neon_mcr,\
+   neon_mcr_2_mcrr,\
+   neon_mrc,\
+   neon_mrrc,\
+   neon_ldm_2,\
+   neon_stm_2,\
+   none"
+ (const_string "none"))
+
+;; Predicates used for setting the above attribute.
+
+(define_mode_attr Is_float_mode [(V8QI "false") (V16QI "false")
+				 (V4HI "false") (V8HI "false")
+				 (V2SI "false") (V4SI "false")
+				 (V2SF "true") (V4SF "true")
+				 (DI "false") (V2DI "false")])
+
+(define_mode_attr Scalar_mul_8_16 [(V8QI "true") (V16QI "true")
+				   (V4HI "true") (V8HI "true")
+				   (V2SI "false") (V4SI "false")
+				   (V2SF "false") (V4SF "false")
+				   (DI "false") (V2DI "false")])
+
+
+(define_mode_attr Is_d_reg [(V8QI "true") (V16QI "false")
+                            (V4HI "true") (V8HI  "false")
+                            (V2SI "true") (V4SI  "false")
+                            (V2SF "true") (V4SF  "false")
+                            (DI   "true") (V2DI  "false")])
+
+(define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16")
+                                 (V4HI "4") (V8HI "8")
+                                 (V2SI "2") (V4SI "4")
+                                 (V2SF "2") (V4SF "4")
+                                 (DI "1")   (V2DI "2")])
+
+;; FIXME: Attributes are probably borked.
+(define_insn "*neon_mov<mode>"
+  [(set (match_operand:VD 0 "nonimmediate_operand"
+	  "=w,Uv,w, w,  ?r,?w,?r,?r, ?Us")
+	(match_operand:VD 1 "general_operand"
+	  " w,w, Dn,Uvi, w, r, r, Usi,r"))]
+  "TARGET_NEON"
+{
+  if (which_alternative == 2)
+    {
+      int width, is_valid;
+      static char templ[40];
+
+      is_valid = neon_immediate_valid_for_move (operands[1], <MODE>mode,
+        &operands[1], &width);
+
+      gcc_assert (is_valid != 0);
+
+      if (width == 0)
+        return "vmov.f32\t%P0, %1  @ <mode>";
+      else
+        sprintf (templ, "vmov.i%d\t%%P0, %%1  @ <mode>", width);
+
+      return templ;
+    }
+
+  /* FIXME: If the memory layout is changed in big-endian mode, output_move_vfp
+     below must be changed to output_move_neon (which will use the
+     element/structure loads/stores), and the constraint changed to 'Un' instead
+     of 'Uv'.  */
+
+  switch (which_alternative)
+    {
+    case 0: return "vmov\t%P0, %P1  @ <mode>";
+    case 1: case 3: return output_move_vfp (operands);
+    case 2: gcc_unreachable ();
+    case 4: return "vmov\t%Q0, %R0, %P1  @ <mode>";
+    case 5: return "vmov\t%P0, %Q1, %R1  @ <mode>";
+    default: return output_move_double (operands);
+    }
+}
+ [(set_attr "neon_type" "neon_int_1,*,neon_vmov,*,neon_mrrc,neon_mcr_2_mcrr,*,*,*")
+  (set_attr "type" "*,f_stored,*,f_loadd,*,*,alu,load2,store2")
+  (set_attr "insn" "*,*,*,*,*,*,mov,*,*")
+  (set_attr "length" "4,4,4,4,4,4,8,8,8")
+  (set_attr "pool_range"     "*,*,*,1020,*,*,*,1020,*")
+  (set_attr "neg_pool_range" "*,*,*,1008,*,*,*,1008,*")])
+
+(define_insn "*neon_mov<mode>"
+  [(set (match_operand:VQXMOV 0 "nonimmediate_operand"
+  	  "=w,Un,w, w,  ?r,?w,?r,?r,  ?Us")
+	(match_operand:VQXMOV 1 "general_operand"
+	  " w,w, Dn,Uni, w, r, r, Usi, r"))]
+  "TARGET_NEON"
+{
+  if (which_alternative == 2)
+    {
+      int width, is_valid;
+      static char templ[40];
+      
+      is_valid = neon_immediate_valid_for_move (operands[1], <MODE>mode,
+        &operands[1], &width);
+      
+      gcc_assert (is_valid != 0);
+      
+      if (width == 0)
+        return "vmov.f32\t%q0, %1  @ <mode>";
+      else
+        sprintf (templ, "vmov.i%d\t%%q0, %%1  @ <mode>", width);
+      
+      return templ;
+    }
+  
+  switch (which_alternative)
+    {
+    case 0: return "vmov\t%q0, %q1  @ <mode>";
+    case 1: case 3: return output_move_neon (operands);
+    case 2: gcc_unreachable ();
+    case 4: return "vmov\t%Q0, %R0, %e1  @ <mode>\;vmov\t%J0, %K0, %f1";
+    case 5: return "vmov\t%e0, %Q1, %R1  @ <mode>\;vmov\t%f0, %J1, %K1";
+    default: return output_move_quad (operands);
+    }
+}
+  [(set_attr "neon_type" "neon_int_1,neon_stm_2,neon_vmov,neon_ldm_2,\
+                          neon_mrrc,neon_mcr_2_mcrr,*,*,*")
+   (set_attr "type" "*,*,*,*,*,*,alu,load4,store4")
+   (set_attr "insn" "*,*,*,*,*,*,mov,*,*")
+   (set_attr "length" "4,8,4,8,8,8,16,8,16")
+   (set_attr "pool_range" "*,*,*,1020,*,*,*,1020,*")
+   (set_attr "neg_pool_range" "*,*,*,1008,*,*,*,1008,*")])
+
+(define_expand "movti"
+  [(set (match_operand:TI 0 "nonimmediate_operand" "")
+	(match_operand:TI 1 "general_operand" ""))]
+  "TARGET_NEON"
+{
+})
+
+(define_expand "mov<mode>"
+  [(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "")
+	(match_operand:VSTRUCT 1 "general_operand" ""))]
+  "TARGET_NEON"
+{
+})
+
+;; APPLE LOCAL begin 6160917
+(define_expand "reload_in<mode>"
+  [(parallel [(match_operand:VDQW 0 "s_register_operand" "=w")
+	      (match_operand:VDQW 1 "neon_reload_mem_operand" "m")
+	      (match_operand:SI   2 "s_register_operand" "=&r")])]
+  "TARGET_NEON"
+  "
+{
+  neon_reload_in (operands, <MODE>mode);
+  DONE;
+}")
+
+(define_expand "reload_out<mode>"
+  [(parallel [(match_operand:VDQW 0 "neon_reload_mem_operand" "=m")
+	      (match_operand:VDQW 1 "s_register_operand" "w")
+	      (match_operand:SI   2 "s_register_operand" "=&r")])]
+  "TARGET_NEON"
+  "
+{
+  neon_reload_out (operands, <MODE>mode);
+  DONE;
+}")
+;; APPLE LOCAL end 6160917
+
+(define_insn "*neon_mov<mode>"
+  [(set (match_operand:VSTRUCT 0 "nonimmediate_operand"	"=w,Ut,w")
+	(match_operand:VSTRUCT 1 "general_operand"	" w,w, Ut"))]
+  "TARGET_NEON"
+{
+  switch (which_alternative)
+    {
+    case 0: return "#";
+    case 1: case 2: return output_move_neon (operands);
+    default: gcc_unreachable ();
+    }
+}
+  [(set_attr "length" "<V_slen>,<V_slen>,<V_slen>")])
+
+(define_split
+  [(set (match_operand:EI 0 "s_register_operand" "")
+	(match_operand:EI 1 "s_register_operand" ""))]
+  "TARGET_NEON && reload_completed"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 2) (match_dup 3))]
+{
+  int rdest = REGNO (operands[0]);
+  int rsrc = REGNO (operands[1]);
+  rtx dest[2], src[2];
+  
+  dest[0] = gen_rtx_REG (TImode, rdest);
+  src[0] = gen_rtx_REG (TImode, rsrc);
+  dest[1] = gen_rtx_REG (DImode, rdest + 4);
+  src[1] = gen_rtx_REG (DImode, rsrc + 4);
+  
+  neon_disambiguate_copy (operands, dest, src, 2);
+})
+
+(define_split
+  [(set (match_operand:OI 0 "s_register_operand" "")
+	(match_operand:OI 1 "s_register_operand" ""))]
+  "TARGET_NEON && reload_completed"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 2) (match_dup 3))]
+{
+  int rdest = REGNO (operands[0]);
+  int rsrc = REGNO (operands[1]);
+  rtx dest[2], src[2];
+  
+  dest[0] = gen_rtx_REG (TImode, rdest);
+  src[0] = gen_rtx_REG (TImode, rsrc);
+  dest[1] = gen_rtx_REG (TImode, rdest + 4);
+  src[1] = gen_rtx_REG (TImode, rsrc + 4);
+  
+  neon_disambiguate_copy (operands, dest, src, 2);
+})
+
+(define_split
+  [(set (match_operand:CI 0 "s_register_operand" "")
+	(match_operand:CI 1 "s_register_operand" ""))]
+  "TARGET_NEON && reload_completed"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 2) (match_dup 3))
+   (set (match_dup 4) (match_dup 5))]
+{
+  int rdest = REGNO (operands[0]);
+  int rsrc = REGNO (operands[1]);
+  rtx dest[3], src[3];
+  
+  dest[0] = gen_rtx_REG (TImode, rdest);
+  src[0] = gen_rtx_REG (TImode, rsrc);
+  dest[1] = gen_rtx_REG (TImode, rdest + 4);
+  src[1] = gen_rtx_REG (TImode, rsrc + 4);
+  dest[2] = gen_rtx_REG (TImode, rdest + 8);
+  src[2] = gen_rtx_REG (TImode, rsrc + 8);
+  
+  neon_disambiguate_copy (operands, dest, src, 3);
+})
+
+(define_split
+  [(set (match_operand:XI 0 "s_register_operand" "")
+	(match_operand:XI 1 "s_register_operand" ""))]
+  "TARGET_NEON && reload_completed"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 2) (match_dup 3))
+   (set (match_dup 4) (match_dup 5))
+   (set (match_dup 6) (match_dup 7))]
+{
+  int rdest = REGNO (operands[0]);
+  int rsrc = REGNO (operands[1]);
+  rtx dest[4], src[4];
+  
+  dest[0] = gen_rtx_REG (TImode, rdest);
+  src[0] = gen_rtx_REG (TImode, rsrc);
+  dest[1] = gen_rtx_REG (TImode, rdest + 4);
+  src[1] = gen_rtx_REG (TImode, rsrc + 4);
+  dest[2] = gen_rtx_REG (TImode, rdest + 8);
+  src[2] = gen_rtx_REG (TImode, rsrc + 8);
+  dest[3] = gen_rtx_REG (TImode, rdest + 12);
+  src[3] = gen_rtx_REG (TImode, rsrc + 12);
+  
+  neon_disambiguate_copy (operands, dest, src, 4);
+})
+
+; FIXME: Set/extract/init quads.
+
+(define_insn "vec_set<mode>"
+  [(set (match_operand:VD 0 "s_register_operand" "+w")
+        (vec_merge:VD
+          (match_operand:VD 3 "s_register_operand" "0")
+          (vec_duplicate:VD
+            (match_operand:<V_elem> 1 "s_register_operand" "r"))
+          (ashift:SI (const_int 1)
+                     (match_operand:SI 2 "immediate_operand" "i"))))]
+  "TARGET_NEON"
+  "vmov%?.<V_uf_sclr>\t%P0[%c2], %1"
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_mcr")]
+)
+
+(define_insn "vec_set<mode>"
+  [(set (match_operand:VQ 0 "s_register_operand" "+w")
+        (vec_merge:VQ
+          (match_operand:VQ 3 "s_register_operand" "0")
+          (vec_duplicate:VQ
+            (match_operand:<V_elem> 1 "s_register_operand" "r"))
+          (ashift:SI (const_int 1)
+		     (match_operand:SI 2 "immediate_operand" "i"))))]
+  "TARGET_NEON"
+{
+  int half_elts = GET_MODE_NUNITS (<MODE>mode) / 2;
+  int elt = INTVAL (operands[2]) % half_elts;
+  int hi = (INTVAL (operands[2]) / half_elts) * 2;
+  int regno = REGNO (operands[0]);
+  
+  operands[0] = gen_rtx_REG (<V_HALF>mode, regno + hi);
+  operands[2] = GEN_INT (elt);
+  
+  return "vmov%?.<V_uf_sclr>\t%P0[%c2], %1";
+}
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_mcr")]
+)
+
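As a worked example of the lane arithmetic in the pattern above (assuming
V8HImode, i.e. eight 16-bit lanes spread across the two D halves of a Q
register): setting lane 5 gives half_elts = 4, elt = 1 and hi = 2, so the vmov
writes lane 1 of the high D half, the +2 register-number offset being how the
high half is addressed here.  In OCaml terms:

  let () =
    let nunits = 8 in                   (* GET_MODE_NUNITS (V8HImode) *)
    let lane = 5 in
    let half_elts = nunits / 2 in
    let elt = lane mod half_elts in
    let hi = (lane / half_elts) * 2 in
    assert ((half_elts, elt, hi) = (4, 1, 2))
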
+(define_insn "vec_setv2di"
+  [(set (match_operand:V2DI 0 "s_register_operand" "+w")
+        (vec_merge:V2DI
+          (match_operand:V2DI 3 "s_register_operand" "0")
+          (vec_duplicate:V2DI
+            (match_operand:DI 1 "s_register_operand" "r"))
+          (ashift:SI (const_int 1)
+		     (match_operand:SI 2 "immediate_operand" "i"))))]
+  "TARGET_NEON"
+{
+  int regno = REGNO (operands[0]) + INTVAL (operands[2]);
+  
+  operands[0] = gen_rtx_REG (DImode, regno);
+  
+  return "vmov%?.64\t%P0, %Q1, %R1";
+}
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_mcr_2_mcrr")]
+)
+
+(define_insn "vec_extract<mode>"
+  [(set (match_operand:<V_elem> 0 "s_register_operand" "=r")
+        (vec_select:<V_elem>
+          (match_operand:VD 1 "s_register_operand" "w")
+          (parallel [(match_operand:SI 2 "immediate_operand" "i")])))]
+  "TARGET_NEON"
+  "vmov%?.<V_uf_sclr>\t%0, %P1[%c2]"
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "vec_extract<mode>"
+  [(set (match_operand:<V_elem> 0 "s_register_operand" "=r")
+	(vec_select:<V_elem>
+          (match_operand:VQ 1 "s_register_operand" "w")
+          (parallel [(match_operand:SI 2 "immediate_operand" "i")])))]
+  "TARGET_NEON"
+{
+  int half_elts = GET_MODE_NUNITS (<MODE>mode) / 2;
+  int elt = INTVAL (operands[2]) % half_elts;
+  int hi = (INTVAL (operands[2]) / half_elts) * 2;
+  int regno = REGNO (operands[1]);
+  
+  operands[1] = gen_rtx_REG (<V_HALF>mode, regno + hi);
+  operands[2] = GEN_INT (elt);
+  
+  return "vmov%?.<V_uf_sclr>\t%0, %P1[%c2]";
+}
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "vec_extractv2di"
+  [(set (match_operand:DI 0 "s_register_operand" "=r")
+	(vec_select:DI
+          (match_operand:V2DI 1 "s_register_operand" "w")
+          (parallel [(match_operand:SI 2 "immediate_operand" "i")])))]
+  "TARGET_NEON"
+{
+  int regno = REGNO (operands[1]) + INTVAL (operands[2]);
+  
+  operands[1] = gen_rtx_REG (DImode, regno);
+  
+  return "vmov%?.64\t%Q0, %R0, %P1";
+}
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_int_1")]
+)
+
+(define_expand "vec_init<mode>"
+  [(match_operand:VDQ 0 "s_register_operand" "")
+   (match_operand 1 "" "")]
+  "TARGET_NEON"
+{
+  neon_expand_vector_init (operands[0], operands[1]);
+  DONE;
+})
+
+;; Doubleword and quadword arithmetic.
+
+;; NOTE: vadd/vsub and some other instructions also support 64-bit integer
+;; element size, which we could potentially use for "long long" operations. We
+;; don't want to do this at present though, because moving values from the
+;; vector unit to the ARM core is currently slow and 64-bit addition (etc.) is
+;; easy to do with ARM instructions anyway.
+
+(define_insn "*add<mode>3_neon"
+  [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+        (plus:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+		  (match_operand:VDQ 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vadd.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                  (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                    (const_string "neon_int_1")))]
+)
+
+(define_insn "*sub<mode>3_neon"
+  [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+        (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+                   (match_operand:VDQ 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                  (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                    (const_string "neon_int_2")))]
+)
+
+(define_insn "*mul<mode>3_neon"
+  [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+        (mult:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+                  (match_operand:VDQ 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vmul.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                  (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (if_then_else
+                                    (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                    (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long")
+                                    (const_string "neon_mul_qqq_8_16_32_ddd_32"))
+                                  (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                    (const_string "neon_mul_qqq_8_16_32_ddd_32")
+                                    (const_string "neon_mul_qqq_8_16_32_ddd_32")))))]
+)
+
+(define_insn "ior<mode>3"
+  [(set (match_operand:VDQ 0 "s_register_operand" "=w,w")
+	(ior:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0")
+		 (match_operand:VDQ 2 "neon_logic_op2" "w,Dl")))]
+  "TARGET_NEON"
+{
+  switch (which_alternative)
+    {
+    case 0: return "vorr\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
+    case 1: return neon_output_logic_immediate ("vorr", &operands[2],
+		     <MODE>mode, 0, VALID_NEON_QREG_MODE (<MODE>mode));
+    default: gcc_unreachable ();
+    }
+}
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "iordi3_neon"
+  [(set (match_operand:DI 0 "s_register_operand" "=w,w")
+	(unspec:DI [(match_operand:DI 1 "s_register_operand" "w,0")
+		    (match_operand:DI 2 "neon_logic_op2" "w,Dl")]
+                    UNSPEC_VORR))]
+  "TARGET_NEON"
+{
+  switch (which_alternative)
+    {
+    case 0: return "vorr\t%P0, %P1, %P2";
+    case 1: return neon_output_logic_immediate ("vorr", &operands[2],
+		     DImode, 0, VALID_NEON_QREG_MODE (DImode));
+    default: gcc_unreachable ();
+    }
+}
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+;; The concrete forms of the Neon immediate-logic instructions are vbic and
+;; vorr. We support the pseudo-instruction vand instead, because that
+;; corresponds to the canonical form the middle-end expects to use for
+;; immediate bitwise-ANDs.
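For illustration, a minimal C sketch of the canonicalization described above,
using the arm_neon.h intrinsics (helper name invented; whether the immediate
alternative is actually selected depends on the compiler recognizing the
constant):

    #include <arm_neon.h>

    /* AND with an immediate whose complement has a valid NEON encoding.  */
    uint32x4_t clear_low_byte (uint32x4_t x)
    {
      /* The canonical form is a plain AND with 0xffffff00; the immediate
         alternative of "and<mode>3" below can emit it as the concrete
         instruction vbic.i32 with #0xff, i.e. AND with ~imm.  */
      return vandq_u32 (x, vdupq_n_u32 (0xffffff00u));
    }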
+
+(define_insn "and<mode>3"
+  [(set (match_operand:VDQ 0 "s_register_operand" "=w,w")
+	(and:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0")
+		 (match_operand:VDQ 2 "neon_inv_logic_op2" "w,DL")))]
+  "TARGET_NEON"
+{
+  switch (which_alternative)
+    {
+    case 0: return "vand\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
+    case 1: return neon_output_logic_immediate ("vand", &operands[2],
+    		     <MODE>mode, 1, VALID_NEON_QREG_MODE (<MODE>mode));
+    default: gcc_unreachable ();
+    }
+}
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "anddi3_neon"
+  [(set (match_operand:DI 0 "s_register_operand" "=w,w")
+	(unspec:DI [(match_operand:DI 1 "s_register_operand" "w,0")
+		    (match_operand:DI 2 "neon_inv_logic_op2" "w,DL")]
+                    UNSPEC_VAND))]
+  "TARGET_NEON"
+{
+  switch (which_alternative)
+    {
+    case 0: return "vand\t%P0, %P1, %P2";
+    case 1: return neon_output_logic_immediate ("vand", &operands[2],
+    		     DImode, 1, VALID_NEON_QREG_MODE (DImode));
+    default: gcc_unreachable ();
+    }
+}
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "orn<mode>3_neon"
+  [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+	(ior:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+		 (not:VDQ (match_operand:VDQ 2 "s_register_operand" "w"))))]
+  "TARGET_NEON"
+  "vorn\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "orndi3_neon"
+  [(set (match_operand:DI 0 "s_register_operand" "=w")
+	(unspec:DI [(match_operand:DI 1 "s_register_operand" "w")
+		    (match_operand:DI 2 "s_register_operand" "w")]
+                    UNSPEC_VORN))]
+  "TARGET_NEON"
+  "vorn\t%P0, %P1, %P2"
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "bic<mode>3_neon"
+  [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+	(and:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+		  (not:VDQ (match_operand:VDQ 2 "s_register_operand" "w"))))]
+  "TARGET_NEON"
+  "vbic\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "bicdi3_neon"
+  [(set (match_operand:DI 0 "s_register_operand" "=w")
+	(unspec:DI [(match_operand:DI 1 "s_register_operand" "w")
+		     (match_operand:DI 2 "s_register_operand" "w")]
+                    UNSPEC_VBIC))]
+  "TARGET_NEON"
+  "vbic\t%P0, %P1, %P2"
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "xor<mode>3"
+  [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+	(xor:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+		 (match_operand:VDQ 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "veor\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "xordi3_neon"
+  [(set (match_operand:DI 0 "s_register_operand" "=w")
+	(unspec:DI [(match_operand:DI 1 "s_register_operand" "w")
+		     (match_operand:DI 2 "s_register_operand" "w")]
+                    UNSPEC_VEOR))]
+  "TARGET_NEON"
+  "veor\t%P0, %P1, %P2"
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "one_cmpl<mode>2"
+  [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+        (not:VDQ (match_operand:VDQ 1 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vmvn\t%<V_reg>0, %<V_reg>1"
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "abs<mode>2"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+	(abs:VDQW (match_operand:VDQW 1 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vabs.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                  (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                    (const_string "neon_int_3")))]
+)
+
+(define_insn "neg<mode>2"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+	(neg:VDQW (match_operand:VDQW 1 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vneg.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                  (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                    (const_string "neon_int_3")))]
+)
+
+(define_insn "*umin<mode>3_neon"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+	(umin:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
+		    (match_operand:VDQIW 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vmin.<V_u_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "*umax<mode>3_neon"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+	(umax:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
+		    (match_operand:VDQIW 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vmax.<V_u_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "*smin<mode>3_neon"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+	(smin:VDQW (match_operand:VDQW 1 "s_register_operand" "w")
+		   (match_operand:VDQW 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vmin.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (const_string "neon_fp_vadd_ddd_vabs_dd")
+                    (const_string "neon_int_5")))]
+)
+
+(define_insn "*smax<mode>3_neon"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+	(smax:VDQW (match_operand:VDQW 1 "s_register_operand" "w")
+		   (match_operand:VDQW 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vmax.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (const_string "neon_fp_vadd_ddd_vabs_dd")
+                    (const_string "neon_int_5")))]
+)
+
+; TODO: V2DI shifts are currently disabled because there are bugs in the
+; generic vectorizer code.  It ends up creating a V2DI constructor with
+; SImode elements.
+
+(define_insn "ashl<mode>3"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+	(ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
+		      (match_operand:VDQIW 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vshl.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_vshl_ddd")
+                    (const_string "neon_shift_3")))]
+)
+
+; Used for implementing arithmetic shift-right, which is a left-shift by a
+; negative amount, with signed operands. This is essentially the same as
+; ashl<mode>3 above, but using an unspec in case GCC tries anything tricky
+; with negative shift amounts.
+
+(define_insn "ashl<mode>3_signed"
+  [(set (match_operand:VDQI 0 "s_register_operand" "=w")
+	(unspec:VDQI [(match_operand:VDQI 1 "s_register_operand" "w")
+		      (match_operand:VDQI 2 "s_register_operand" "w")]
+		     UNSPEC_ASHIFT_SIGNED))]
+  "TARGET_NEON"
+  "vshl.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_vshl_ddd")
+                    (const_string "neon_shift_3")))]
+)
+
+; Used for implementing logical shift-right, which is a left-shift by a negative
+; amount, with unsigned operands.
+
+(define_insn "ashl<mode>3_unsigned"
+  [(set (match_operand:VDQI 0 "s_register_operand" "=w")
+	(unspec:VDQI [(match_operand:VDQI 1 "s_register_operand" "w")
+		      (match_operand:VDQI 2 "s_register_operand" "w")]
+		     UNSPEC_ASHIFT_UNSIGNED))]
+  "TARGET_NEON"
+  "vshl.<V_u_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_vshl_ddd")
+                    (const_string "neon_shift_3")))]
+)
+
+(define_expand "ashr<mode>3"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "")
+	(ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "")
+			(match_operand:VDQIW 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+{
+  rtx neg = gen_reg_rtx (<MODE>mode);
+
+  emit_insn (gen_neg<mode>2 (neg, operands[2]));
+  emit_insn (gen_ashl<mode>3_signed (operands[0], operands[1], neg));
+
+  DONE;
+})
+
+(define_expand "lshr<mode>3"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "")
+	(lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "")
+			(match_operand:VDQIW 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+{
+  rtx neg = gen_reg_rtx (<MODE>mode);
+
+  emit_insn (gen_neg<mode>2 (neg, operands[2]));
+  emit_insn (gen_ashl<mode>3_unsigned (operands[0], operands[1], neg));
+
+  DONE;
+})
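At the intrinsic level the trick used by the two expanders above looks roughly
like this sketch (helper names invented); NEON's VSHL shifts right when the
per-element shift count is negative:

    #include <arm_neon.h>

    uint32x4_t lshr_by_register (uint32x4_t x, int32x4_t count)
    {
      /* lshr<mode>3: negate the count, then an unsigned VSHL.  */
      return vshlq_u32 (x, vnegq_s32 (count));
    }

    int32x4_t ashr_by_register (int32x4_t x, int32x4_t count)
    {
      /* ashr<mode>3: the same idea with the signed form of VSHL.  */
      return vshlq_s32 (x, vnegq_s32 (count));
    }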
+
+;; Widening operations
+
+;; FIXME: I'm not sure if sign/zero_extend are legal to use on vector modes.
+
+(define_insn "widen_ssum<mode>3"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+	(plus:<V_widen> (sign_extend:<V_widen>
+			  (match_operand:VW 1 "s_register_operand" "%w"))
+		        (match_operand:<V_widen> 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q2, %P1"
+  [(set_attr "neon_type" "neon_int_3")]
+)
+
+(define_insn "widen_usum<mode>3"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+	(plus:<V_widen> (zero_extend:<V_widen>
+			  (match_operand:VW 1 "s_register_operand" "%w"))
+		        (match_operand:<V_widen> 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vaddw.<V_u_elem>\t%q0, %q2, %P1"
+  [(set_attr "neon_type" "neon_int_3")]
+)
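For reference, the operation described by the widen_[su]sum patterns above
corresponds to the vaddw family of intrinsics; a minimal sketch (helper name
invented):

    #include <arm_neon.h>

    int32x4_t widen_ssum_sketch (int32x4_t acc, int16x4_t narrow)
    {
      /* Sign-extend each 16-bit lane of 'narrow' and add it to 'acc'
         (a single VADDW.S16).  */
      return vaddw_s16 (acc, narrow);
    }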
+
+;; VEXT can be used to synthesize coarse whole-vector shifts with 8-bit
+;; shift-count granularity. That's good enough for the middle-end's current
+;; needs.
+
+(define_expand "vec_shr_<mode>"
+  [(match_operand:VDQ 0 "s_register_operand" "")
+   (match_operand:VDQ 1 "s_register_operand" "")
+   (match_operand:SI 2 "const_multiple_of_8_operand" "")]
+  "TARGET_NEON"
+{
+  rtx zero_reg;
+  HOST_WIDE_INT num_bits = INTVAL (operands[2]);
+  const int width = GET_MODE_BITSIZE (<MODE>mode);
+  const enum machine_mode bvecmode = (width == 128) ? V16QImode : V8QImode;
+  rtx (*gen_ext) (rtx, rtx, rtx, rtx) =
+    (width == 128) ? gen_neon_vextv16qi : gen_neon_vextv8qi;
+
+  if (num_bits == width)
+    {
+      emit_move_insn (operands[0], operands[1]);
+      DONE;
+    }
+  
+  zero_reg = force_reg (bvecmode, CONST0_RTX (bvecmode));
+  operands[0] = gen_lowpart (bvecmode, operands[0]);
+  operands[1] = gen_lowpart (bvecmode, operands[1]);
+  
+  emit_insn (gen_ext (operands[0], operands[1], zero_reg,
+		      GEN_INT (num_bits / BITS_PER_UNIT)));
+  DONE;
+})
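A sketch of what vec_shr_<mode> produces for a 128-bit vector (helper name
invented; little-endian element order assumed): shifting the whole vector down
by, say, 32 bits becomes a byte-granular VEXT against a zero vector:

    #include <arm_neon.h>

    uint8x16_t whole_vector_shr_32bits (uint8x16_t x)
    {
      /* Bytes 4..15 of x followed by four zero bytes: a 32-bit shift of the
         whole vector towards element 0, matching the vext emitted above.  */
      return vextq_u8 (x, vdupq_n_u8 (0), 4);
    }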
+
+(define_expand "vec_shl_<mode>"
+  [(match_operand:VDQ 0 "s_register_operand" "")
+   (match_operand:VDQ 1 "s_register_operand" "")
+   (match_operand:SI 2 "const_multiple_of_8_operand" "")]
+  "TARGET_NEON"
+{
+  rtx zero_reg;
+  HOST_WIDE_INT num_bits = INTVAL (operands[2]);
+  const int width = GET_MODE_BITSIZE (<MODE>mode);
+  const enum machine_mode bvecmode = (width == 128) ? V16QImode : V8QImode;
+  rtx (*gen_ext) (rtx, rtx, rtx, rtx) =
+    (width == 128) ? gen_neon_vextv16qi : gen_neon_vextv8qi;
+  
+  if (num_bits == 0)
+    {
+      emit_move_insn (operands[0], CONST0_RTX (<MODE>mode));
+      DONE;
+    }
+  
+  num_bits = width - num_bits;
+  
+  zero_reg = force_reg (bvecmode, CONST0_RTX (bvecmode));
+  operands[0] = gen_lowpart (bvecmode, operands[0]);
+  operands[1] = gen_lowpart (bvecmode, operands[1]);
+  
+  emit_insn (gen_ext (operands[0], zero_reg, operands[1],
+		      GEN_INT (num_bits / BITS_PER_UNIT)));
+  DONE;
+})
+
+;; Helpers for quad-word reduction operations
+
+; Add (or smin, smax...) the low N/2 elements of the N-element vector
+; operand[1] to the high N/2 elements of the same vector. Put the result in
+; operand[0], an N/2-element vector.
+
+(define_insn "quad_halves_<code>v4si"
+  [(set (match_operand:V2SI 0 "s_register_operand" "=w")
+        (vqh_ops:V2SI
+          (vec_select:V2SI (match_operand:V4SI 1 "s_register_operand" "w")
+                           (parallel [(const_int 0) (const_int 1)]))
+          (vec_select:V2SI (match_dup 1)
+                           (parallel [(const_int 2) (const_int 3)]))))]
+  "TARGET_NEON"
+  "<VQH_mnem>.<VQH_sign>32\t%P0, %e1, %f1"
+  [(set_attr "vqh_mnem" "<VQH_mnem>")
+   (set (attr "neon_type")
+      (if_then_else (eq_attr "vqh_mnem" "vadd")
+                    (const_string "neon_int_1") (const_string "neon_int_5")))]
+)
+
+(define_insn "quad_halves_<code>v4sf"
+  [(set (match_operand:V2SF 0 "s_register_operand" "=w")
+        (vqhs_ops:V2SF
+          (vec_select:V2SF (match_operand:V4SF 1 "s_register_operand" "w")
+                           (parallel [(const_int 0) (const_int 1)]))
+          (vec_select:V2SF (match_dup 1)
+                           (parallel [(const_int 2) (const_int 3)]))))]
+  "TARGET_NEON"
+  "<VQH_mnem>.f32\t%P0, %e1, %f1"
+  [(set_attr "vqh_mnem" "<VQH_mnem>")
+   (set (attr "neon_type")
+      (if_then_else (eq_attr "vqh_mnem" "vadd")
+                    (const_string "neon_int_1") (const_string "neon_int_5")))]
+)
+
+(define_insn "quad_halves_<code>v8hi"
+  [(set (match_operand:V4HI 0 "s_register_operand" "+w")
+        (vqh_ops:V4HI
+          (vec_select:V4HI (match_operand:V8HI 1 "s_register_operand" "w")
+                           (parallel [(const_int 0) (const_int 1)
+				      (const_int 2) (const_int 3)]))
+          (vec_select:V4HI (match_dup 1)
+                           (parallel [(const_int 4) (const_int 5)
+				      (const_int 6) (const_int 7)]))))]
+  "TARGET_NEON"
+  "<VQH_mnem>.<VQH_sign>16\t%P0, %e1, %f1"
+  [(set_attr "vqh_mnem" "<VQH_mnem>")
+   (set (attr "neon_type")
+      (if_then_else (eq_attr "vqh_mnem" "vadd")
+                    (const_string "neon_int_1") (const_string "neon_int_5")))]
+)
+
+(define_insn "quad_halves_<code>v16qi"
+  [(set (match_operand:V8QI 0 "s_register_operand" "+w")
+        (vqh_ops:V8QI
+          (vec_select:V8QI (match_operand:V16QI 1 "s_register_operand" "w")
+                           (parallel [(const_int 0) (const_int 1)
+				      (const_int 2) (const_int 3)
+				      (const_int 4) (const_int 5)
+				      (const_int 6) (const_int 7)]))
+          (vec_select:V8QI (match_dup 1)
+                           (parallel [(const_int 8) (const_int 9)
+				      (const_int 10) (const_int 11)
+				      (const_int 12) (const_int 13)
+				      (const_int 14) (const_int 15)]))))]
+  "TARGET_NEON"
+  "<VQH_mnem>.<VQH_sign>8\t%P0, %e1, %f1"
+  [(set_attr "vqh_mnem" "<VQH_mnem>")
+   (set (attr "neon_type")
+      (if_then_else (eq_attr "vqh_mnem" "vadd")
+                    (const_string "neon_int_1") (const_string "neon_int_5")))]
+)
+
+; FIXME: We wouldn't need the following insns if we could write subregs of
+; vector registers. Make an attempt at removing unnecessary moves, though
+; we're really at the mercy of the register allocator.
+
+(define_insn "move_lo_quad_v4si"
+  [(set (match_operand:V4SI 0 "s_register_operand" "+w")
+        (vec_concat:V4SI
+          (match_operand:V2SI 1 "s_register_operand" "w")
+          (vec_select:V2SI (match_dup 0)
+			   (parallel [(const_int 2) (const_int 3)]))))]
+  "TARGET_NEON"
+{
+  int dest = REGNO (operands[0]);
+  int src = REGNO (operands[1]);
+  
+  if (dest != src)
+    return "vmov\t%e0, %P1";
+  else
+    return "";
+}
+  [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "move_lo_quad_v4sf"
+  [(set (match_operand:V4SF 0 "s_register_operand" "+w")
+        (vec_concat:V4SF
+          (match_operand:V2SF 1 "s_register_operand" "w")
+          (vec_select:V2SF (match_dup 0)
+			   (parallel [(const_int 2) (const_int 3)]))))]
+  "TARGET_NEON"
+{
+  int dest = REGNO (operands[0]);
+  int src = REGNO (operands[1]);
+  
+  if (dest != src)
+    return "vmov\t%e0, %P1";
+  else
+    return "";
+}
+  [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "move_lo_quad_v8hi"
+  [(set (match_operand:V8HI 0 "s_register_operand" "+w")
+        (vec_concat:V8HI
+          (match_operand:V4HI 1 "s_register_operand" "w")
+          (vec_select:V4HI (match_dup 0)
+                           (parallel [(const_int 4) (const_int 5)
+                                      (const_int 6) (const_int 7)]))))]
+  "TARGET_NEON"
+{
+  int dest = REGNO (operands[0]);
+  int src = REGNO (operands[1]);
+  
+  if (dest != src)
+    return "vmov\t%e0, %P1";
+  else
+    return "";
+}
+  [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "move_lo_quad_v16qi"
+  [(set (match_operand:V16QI 0 "s_register_operand" "+w")
+        (vec_concat:V16QI
+          (match_operand:V8QI 1 "s_register_operand" "w")
+          (vec_select:V8QI (match_dup 0)
+                           (parallel [(const_int 8)  (const_int 9)
+                                      (const_int 10) (const_int 11)
+                                      (const_int 12) (const_int 13)
+                                      (const_int 14) (const_int 15)]))))]
+  "TARGET_NEON"
+{
+  int dest = REGNO (operands[0]);
+  int src = REGNO (operands[1]);
+  
+  if (dest != src)
+    return "vmov\t%e0, %P1";
+  else
+    return "";
+}
+  [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+;; Reduction operations
+
+(define_expand "reduc_splus_<mode>"
+  [(match_operand:VD 0 "s_register_operand" "")
+   (match_operand:VD 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_pairwise_reduce (operands[0], operands[1], <MODE>mode,
+			&gen_neon_vpadd_internal<mode>);
+  DONE;
+})
+
+(define_expand "reduc_splus_<mode>"
+  [(match_operand:VQ 0 "s_register_operand" "")
+   (match_operand:VQ 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  rtx step1 = gen_reg_rtx (<V_HALF>mode);
+  rtx res_d = gen_reg_rtx (<V_HALF>mode);
+  
+  emit_insn (gen_quad_halves_plus<mode> (step1, operands[1]));
+  emit_insn (gen_reduc_splus_<V_half> (res_d, step1));
+  emit_insn (gen_move_lo_quad_<mode> (operands[0], res_d));
+
+  DONE;
+})
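The quad-word reduction strategy above, written as a rough intrinsic-level
sketch for V4SI (helper name invented; the real patterns keep the result in a
vector register via move_lo_quad rather than extracting a lane):

    #include <arm_neon.h>

    int32_t reduc_splus_v4si_sketch (int32x4_t x)
    {
      /* Step 1: quad_halves_plusv4si -- add the high half onto the low half.  */
      int32x2_t half = vadd_s32 (vget_low_s32 (x), vget_high_s32 (x));
      /* Step 2: reduc_splus_v2si -- pairwise add within the doubleword.  */
      half = vpadd_s32 (half, half);
      /* The sum now sits in both lanes of 'half'; read lane 0.  */
      return vget_lane_s32 (half, 0);
    }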
+
+(define_insn "reduc_splus_v2di"
+  [(set (match_operand:V2DI 0 "s_register_operand" "=w")
+	(unspec:V2DI [(match_operand:V2DI 1 "s_register_operand" "w")]
+		     UNSPEC_VPADD))]
+  "TARGET_NEON"
+  "vadd.i64\t%e0, %e1, %f1"
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+;; NEON does not distinguish between signed and unsigned addition except on
+;; widening operations.
+(define_expand "reduc_uplus_<mode>"
+  [(match_operand:VDQI 0 "s_register_operand" "")
+   (match_operand:VDQI 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  emit_insn (gen_reduc_splus_<mode> (operands[0], operands[1]));
+  DONE;
+})
+
+(define_expand "reduc_smin_<mode>"
+  [(match_operand:VD 0 "s_register_operand" "")
+   (match_operand:VD 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_pairwise_reduce (operands[0], operands[1], <MODE>mode,
+			&gen_neon_vpsmin<mode>);
+  DONE;
+})
+
+(define_expand "reduc_smin_<mode>"
+  [(match_operand:VQ 0 "s_register_operand" "")
+   (match_operand:VQ 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  rtx step1 = gen_reg_rtx (<V_HALF>mode);
+  rtx res_d = gen_reg_rtx (<V_HALF>mode);
+  
+  emit_insn (gen_quad_halves_smin<mode> (step1, operands[1]));
+  emit_insn (gen_reduc_smin_<V_half> (res_d, step1));
+  emit_insn (gen_move_lo_quad_<mode> (operands[0], res_d));
+  
+  DONE;
+})
+
+(define_expand "reduc_smax_<mode>"
+  [(match_operand:VD 0 "s_register_operand" "")
+   (match_operand:VD 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_pairwise_reduce (operands[0], operands[1], <MODE>mode,
+			&gen_neon_vpsmax<mode>);
+  DONE;
+})
+
+(define_expand "reduc_smax_<mode>"
+  [(match_operand:VQ 0 "s_register_operand" "")
+   (match_operand:VQ 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  rtx step1 = gen_reg_rtx (<V_HALF>mode);
+  rtx res_d = gen_reg_rtx (<V_HALF>mode);
+  
+  emit_insn (gen_quad_halves_smax<mode> (step1, operands[1]));
+  emit_insn (gen_reduc_smax_<V_half> (res_d, step1));
+  emit_insn (gen_move_lo_quad_<mode> (operands[0], res_d));
+  
+  DONE;
+})
+
+(define_expand "reduc_umin_<mode>"
+  [(match_operand:VDI 0 "s_register_operand" "")
+   (match_operand:VDI 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_pairwise_reduce (operands[0], operands[1], <MODE>mode,
+			&gen_neon_vpumin<mode>);
+  DONE;
+})
+
+(define_expand "reduc_umin_<mode>"
+  [(match_operand:VQI 0 "s_register_operand" "")
+   (match_operand:VQI 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  rtx step1 = gen_reg_rtx (<V_HALF>mode);
+  rtx res_d = gen_reg_rtx (<V_HALF>mode);
+  
+  emit_insn (gen_quad_halves_umin<mode> (step1, operands[1]));
+  emit_insn (gen_reduc_umin_<V_half> (res_d, step1));
+  emit_insn (gen_move_lo_quad_<mode> (operands[0], res_d));
+
+  DONE;
+})
+
+(define_expand "reduc_umax_<mode>"
+  [(match_operand:VDI 0 "s_register_operand" "")
+   (match_operand:VDI 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_pairwise_reduce (operands[0], operands[1], <MODE>mode,
+			&gen_neon_vpumax<mode>);
+  DONE;
+})
+
+(define_expand "reduc_umax_<mode>"
+  [(match_operand:VQI 0 "s_register_operand" "")
+   (match_operand:VQI 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  rtx step1 = gen_reg_rtx (<V_HALF>mode);
+  rtx res_d = gen_reg_rtx (<V_HALF>mode);
+  
+  emit_insn (gen_quad_halves_umax<mode> (step1, operands[1]));
+  emit_insn (gen_reduc_umax_<V_half> (res_d, step1));
+  emit_insn (gen_move_lo_quad_<mode> (operands[0], res_d));
+  
+  DONE;
+})
+
+(define_insn "neon_vpadd_internal<mode>"
+  [(set (match_operand:VD 0 "s_register_operand" "=w")
+	(unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
+		    (match_operand:VD 2 "s_register_operand" "w")]
+                   UNSPEC_VPADD))]
+  "TARGET_NEON"
+  "vpadd.<V_if_elem>\t%P0, %P1, %P2"
+  ;; Assume this schedules like vadd.
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                  (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                    (const_string "neon_int_1")))]
+)
+
+(define_insn "neon_vpsmin<mode>"
+  [(set (match_operand:VD 0 "s_register_operand" "=w")
+	(unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
+		    (match_operand:VD 2 "s_register_operand" "w")]
+                   UNSPEC_VPSMIN))]
+  "TARGET_NEON"
+  "vpmin.<V_s_elem>\t%P0, %P1, %P2"
+  ;; Assume this schedules like vmin.
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (const_string "neon_fp_vadd_ddd_vabs_dd")
+                    (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vpsmax<mode>"
+  [(set (match_operand:VD 0 "s_register_operand" "=w")
+	(unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
+		    (match_operand:VD 2 "s_register_operand" "w")]
+                   UNSPEC_VPSMAX))]
+  "TARGET_NEON"
+  "vpmax.<V_s_elem>\t%P0, %P1, %P2"
+  ;; Assume this schedules like vmax.
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (const_string "neon_fp_vadd_ddd_vabs_dd")
+                    (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vpumin<mode>"
+  [(set (match_operand:VDI 0 "s_register_operand" "=w")
+	(unspec:VDI [(match_operand:VDI 1 "s_register_operand" "w")
+		     (match_operand:VDI 2 "s_register_operand" "w")]
+                   UNSPEC_VPUMIN))]
+  "TARGET_NEON"
+  "vpmin.<V_u_elem>\t%P0, %P1, %P2"
+  ;; Assume this schedules like umin.
+  [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "neon_vpumax<mode>"
+  [(set (match_operand:VDI 0 "s_register_operand" "=w")
+	(unspec:VDI [(match_operand:VDI 1 "s_register_operand" "w")
+		     (match_operand:VDI 2 "s_register_operand" "w")]
+                   UNSPEC_VPUMAX))]
+  "TARGET_NEON"
+  "vpmax.<V_u_elem>\t%P0, %P1, %P2"
+  ;; Assume this schedules like umax.
+  [(set_attr "neon_type" "neon_int_5")]
+)
+
+;; Saturating arithmetic
+
+; NOTE: Neon supports many more saturating variants of instructions than the
+; following, but these are all that GCC currently understands.
+; FIXME: Actually, GCC doesn't know how to create saturating add/sub by itself
+; yet either, although these patterns may be used by intrinsics when they're
+; added.
+
+(define_insn "*ss_add<mode>_neon"
+  [(set (match_operand:VD 0 "s_register_operand" "=w")
+       (ss_plus:VD (match_operand:VD 1 "s_register_operand" "w")
+                   (match_operand:VD 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vqadd.<V_s_elem>\t%P0, %P1, %P2"
+  [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "*us_add<mode>_neon"
+  [(set (match_operand:VD 0 "s_register_operand" "=w")
+       (us_plus:VD (match_operand:VD 1 "s_register_operand" "w")
+                   (match_operand:VD 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vqadd.<V_u_elem>\t%P0, %P1, %P2"
+  [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "*ss_sub<mode>_neon"
+  [(set (match_operand:VD 0 "s_register_operand" "=w")
+       (ss_minus:VD (match_operand:VD 1 "s_register_operand" "w")
+                    (match_operand:VD 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vqsub.<V_s_elem>\t%P0, %P1, %P2"
+  [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "*us_sub<mode>_neon"
+  [(set (match_operand:VD 0 "s_register_operand" "=w")
+       (us_minus:VD (match_operand:VD 1 "s_register_operand" "w")
+                    (match_operand:VD 2 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vqsub.<V_u_elem>\t%P0, %P1, %P2"
+  [(set_attr "neon_type" "neon_int_5")]
+)
+
+; FIXME: These instructions aren't supported in GCC 4.1, but are documented
+; for the current trunk. Uncomment when this code is merged to a GCC version
+; which supports them.
+
+;(define_insn "*ss_neg<mode>_neon"
+;  [(set (match_operand:VD 0 "s_register_operand" "=w")
+;      (ss_neg:VD (match_operand:VD 1 "s_register_operand" "w")))]
+;  "TARGET_NEON"
+;  "vqneg.<V_s_elem>\t%P0, %P1")
+
+;(define_insn "*ss_ashift<mode>_neon"
+;  [(set (match_operand:VD 0 "s_register_operand" "=w")
+;      (ss_ashift:VD (match_operand:VD 1 "s_register_operand" "w")
+;                    (match_operand:VD 2 "s_register_operand" "w")))]
+;  "TARGET_NEON"
+;  "vqshl.<V_s_elem>\t%P0, %P1, %P2")
+
+;; Patterns for builtins.
+
+; good for plain vadd, vaddq.
+
+(define_insn "neon_vadd<mode>"
+  [(set (match_operand:VDQX 0 "s_register_operand" "=w")
+        (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w")
+		      (match_operand:VDQX 2 "s_register_operand" "w")
+                      (match_operand:SI 3 "immediate_operand" "i")]
+                     UNSPEC_VADD))]
+  "TARGET_NEON"
+  "vadd.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                  (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                    (const_string "neon_int_1")))]
+)
+
+; operand 3 represents in bits:
+;  bit 0: signed (vs unsigned).
+;  bit 1: rounding (vs none).
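A small sketch of that encoding (macro names invented, purely to illustrate
the bit layout the builtins pass as operand 3, which the %T3/%O3 output
modifiers decode when printing the mnemonic):

    #define NEON_VARIANT_SIGNED    (1 << 0)   /* bit 0: signed vs unsigned  */
    #define NEON_VARIANT_ROUNDING  (1 << 1)   /* bit 1: rounding vs none    */

    /* e.g. a signed, rounding variant passes
       (NEON_VARIANT_SIGNED | NEON_VARIANT_ROUNDING) == 3,
       while a plain unsigned, non-rounding variant passes 0.  */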
+
+(define_insn "neon_vaddl<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+        (unspec:<V_widen> [(match_operand:VDI 1 "s_register_operand" "w")
+		           (match_operand:VDI 2 "s_register_operand" "w")
+                           (match_operand:SI 3 "immediate_operand" "i")]
+                          UNSPEC_VADDL))]
+  "TARGET_NEON"
+  "vaddl.%T3%#<V_sz_elem>\t%q0, %P1, %P2"
+  [(set_attr "neon_type" "neon_int_3")]
+)
+
+(define_insn "neon_vaddw<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+        (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "w")
+		           (match_operand:VDI 2 "s_register_operand" "w")
+                           (match_operand:SI 3 "immediate_operand" "i")]
+                          UNSPEC_VADDW))]
+  "TARGET_NEON"
+  "vaddw.%T3%#<V_sz_elem>\t%q0, %q1, %P2"
+  [(set_attr "neon_type" "neon_int_2")]
+)
+
+; vhadd and vrhadd.
+
+(define_insn "neon_vhadd<mode>"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+        (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+		       (match_operand:VDQIW 2 "s_register_operand" "w")
+		       (match_operand:SI 3 "immediate_operand" "i")]
+		      UNSPEC_VHADD))]
+  "TARGET_NEON"
+  "v%O3hadd.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "neon_vqadd<mode>"
+  [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+        (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+		       (match_operand:VDQIX 2 "s_register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                     UNSPEC_VQADD))]
+  "TARGET_NEON"
+  "vqadd.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "neon_vaddhn<mode>"
+  [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+        (unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+		            (match_operand:VN 2 "s_register_operand" "w")
+                            (match_operand:SI 3 "immediate_operand" "i")]
+                           UNSPEC_VADDHN))]
+  "TARGET_NEON"
+  "v%O3addhn.<V_if_elem>\t%P0, %q1, %q2"
+  [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "neon_vmul<mode>"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+        (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w")
+		      (match_operand:VDQW 2 "s_register_operand" "w")
+		      (match_operand:SI 3 "immediate_operand" "i")]
+		     UNSPEC_VMUL))]
+  "TARGET_NEON"
+  "vmul.%F3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                  (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (if_then_else
+                                    (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                    (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long")
+                                    (const_string "neon_mul_qqq_8_16_32_ddd_32"))
+                                  (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                    (const_string "neon_mul_qqq_8_16_32_ddd_32")
+                                    (const_string "neon_mul_qqq_8_16_32_ddd_32")))))]
+)
+
+(define_insn "neon_vmla<mode>"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+        (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
+		      (match_operand:VDQW 2 "s_register_operand" "w")
+		      (match_operand:VDQW 3 "s_register_operand" "w")
+                     (match_operand:SI 4 "immediate_operand" "i")]
+                    UNSPEC_VMLA))]
+  "TARGET_NEON"
+  "vmla.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (const_string "neon_fp_vmla_ddd")
+                                  (const_string "neon_fp_vmla_qqq"))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (if_then_else
+                                    (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                    (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")
+                                    (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"))
+                                  (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                    (const_string "neon_mla_qqq_8_16")
+                                    (const_string "neon_mla_qqq_32_qqd_32_scalar")))))]
+)
+
+(define_insn "neon_vmlal<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+        (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+		           (match_operand:VW 2 "s_register_operand" "w")
+		           (match_operand:VW 3 "s_register_operand" "w")
+                           (match_operand:SI 4 "immediate_operand" "i")]
+                          UNSPEC_VMLAL))]
+  "TARGET_NEON"
+  "vmlal.%T4%#<V_sz_elem>\t%q0, %P2, %P3"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")
+                   (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vmls<mode>"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+        (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
+		      (match_operand:VDQW 2 "s_register_operand" "w")
+		      (match_operand:VDQW 3 "s_register_operand" "w")
+                     (match_operand:SI 4 "immediate_operand" "i")]
+                    UNSPEC_VMLS))]
+  "TARGET_NEON"
+  "vmls.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (const_string "neon_fp_vmla_ddd")
+                                  (const_string "neon_fp_vmla_qqq"))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (if_then_else
+                                    (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                    (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")
+                                    (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"))
+                                  (if_then_else
+                                    (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                    (const_string "neon_mla_qqq_8_16")
+                                    (const_string "neon_mla_qqq_32_qqd_32_scalar")))))]
+)
+
+(define_insn "neon_vmlsl<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+        (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+		           (match_operand:VW 2 "s_register_operand" "w")
+		           (match_operand:VW 3 "s_register_operand" "w")
+                           (match_operand:SI 4 "immediate_operand" "i")]
+                          UNSPEC_VMLSL))]
+  "TARGET_NEON"
+  "vmlsl.%T4%#<V_sz_elem>\t%q0, %P2, %P3"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")
+                   (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vqdmulh<mode>"
+  [(set (match_operand:VMDQI 0 "s_register_operand" "=w")
+        (unspec:VMDQI [(match_operand:VMDQI 1 "s_register_operand" "w")
+		       (match_operand:VMDQI 2 "s_register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_VQDMULH))]
+  "TARGET_NEON"
+  "vq%O3dmulh.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+        (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                      (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long")
+                      (const_string "neon_mul_qqq_8_16_32_ddd_32"))
+        (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                      (const_string "neon_mul_qqq_8_16_32_ddd_32")
+                      (const_string "neon_mul_qqq_8_16_32_ddd_32"))))]
+)
+
+(define_insn "neon_vqdmlal<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+        (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+		           (match_operand:VMDI 2 "s_register_operand" "w")
+		           (match_operand:VMDI 3 "s_register_operand" "w")
+                           (match_operand:SI 4 "immediate_operand" "i")]
+                          UNSPEC_VQDMLAL))]
+  "TARGET_NEON"
+  "vqdmlal.<V_s_elem>\t%q0, %P2, %P3"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")
+                   (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vqdmlsl<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+        (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+		           (match_operand:VMDI 2 "s_register_operand" "w")
+		           (match_operand:VMDI 3 "s_register_operand" "w")
+                           (match_operand:SI 4 "immediate_operand" "i")]
+                          UNSPEC_VQDMLSL))]
+  "TARGET_NEON"
+  "vqdmlsl.<V_s_elem>\t%q0, %P2, %P3"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")
+                   (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vmull<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+        (unspec:<V_widen> [(match_operand:VW 1 "s_register_operand" "w")
+		           (match_operand:VW 2 "s_register_operand" "w")
+                           (match_operand:SI 3 "immediate_operand" "i")]
+                          UNSPEC_VMULL))]
+  "TARGET_NEON"
+  "vmull.%T3%#<V_sz_elem>\t%q0, %P1, %P2"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long")
+                   (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))]
+)
+
+(define_insn "neon_vqdmull<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+        (unspec:<V_widen> [(match_operand:VMDI 1 "s_register_operand" "w")
+		           (match_operand:VMDI 2 "s_register_operand" "w")
+                           (match_operand:SI 3 "immediate_operand" "i")]
+                          UNSPEC_VQDMULL))]
+  "TARGET_NEON"
+  "vqdmull.<V_s_elem>\t%q0, %P1, %P2"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long")
+                   (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))]
+)
+
+(define_insn "neon_vsub<mode>"
+  [(set (match_operand:VDQX 0 "s_register_operand" "=w")
+        (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w")
+		      (match_operand:VDQX 2 "s_register_operand" "w")
+                      (match_operand:SI 3 "immediate_operand" "i")]
+                     UNSPEC_VSUB))]
+  "TARGET_NEON"
+  "vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                  (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                    (const_string "neon_int_2")))]
+)
+
+(define_insn "neon_vsubl<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+        (unspec:<V_widen> [(match_operand:VDI 1 "s_register_operand" "w")
+		           (match_operand:VDI 2 "s_register_operand" "w")
+                           (match_operand:SI 3 "immediate_operand" "i")]
+                          UNSPEC_VSUBL))]
+  "TARGET_NEON"
+  "vsubl.%T3%#<V_sz_elem>\t%q0, %P1, %P2"
+  [(set_attr "neon_type" "neon_int_2")]
+)
+
+(define_insn "neon_vsubw<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+        (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "w")
+		           (match_operand:VDI 2 "s_register_operand" "w")
+                           (match_operand:SI 3 "immediate_operand" "i")]
+			  UNSPEC_VSUBW))]
+  "TARGET_NEON"
+  "vsubw.%T3%#<V_sz_elem>\t%q0, %q1, %P2"
+  [(set_attr "neon_type" "neon_int_2")]
+)
+
+(define_insn "neon_vqsub<mode>"
+  [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+        (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+		       (match_operand:VDQIX 2 "s_register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+		      UNSPEC_VQSUB))]
+  "TARGET_NEON"
+  "vqsub.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "neon_vhsub<mode>"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+        (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+		       (match_operand:VDQIW 2 "s_register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+		      UNSPEC_VHSUB))]
+  "TARGET_NEON"
+  "vhsub.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "neon_vsubhn<mode>"
+  [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+        (unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+		            (match_operand:VN 2 "s_register_operand" "w")
+                            (match_operand:SI 3 "immediate_operand" "i")]
+                           UNSPEC_VSUBHN))]
+  "TARGET_NEON"
+  "v%O3subhn.<V_if_elem>\t%P0, %q1, %q2"
+  [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "neon_vceq<mode>"
+  [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
+        (unspec:<V_cmp_result> [(match_operand:VDQW 1 "s_register_operand" "w")
+		                (match_operand:VDQW 2 "s_register_operand" "w")
+                                (match_operand:SI 3 "immediate_operand" "i")]
+                               UNSPEC_VCEQ))]
+  "TARGET_NEON"
+  "vceq.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                    (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                  (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                  (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                    (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vcge<mode>"
+  [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
+        (unspec:<V_cmp_result> [(match_operand:VDQW 1 "s_register_operand" "w")
+		                (match_operand:VDQW 2 "s_register_operand" "w")
+                                (match_operand:SI 3 "immediate_operand" "i")]
+                               UNSPEC_VCGE))]
+  "TARGET_NEON"
+  "vcge.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                   (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                 (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                 (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                   (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vcgt<mode>"
+  [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
+        (unspec:<V_cmp_result> [(match_operand:VDQW 1 "s_register_operand" "w")
+		                (match_operand:VDQW 2 "s_register_operand" "w")
+                                (match_operand:SI 3 "immediate_operand" "i")]
+                               UNSPEC_VCGT))]
+  "TARGET_NEON"
+  "vcgt.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                   (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                 (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                 (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                   (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vcage<mode>"
+  [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
+        (unspec:<V_cmp_result> [(match_operand:VCVTF 1 "s_register_operand" "w")
+		                (match_operand:VCVTF 2 "s_register_operand" "w")
+                                (match_operand:SI 3 "immediate_operand" "i")]
+                               UNSPEC_VCAGE))]
+  "TARGET_NEON"
+  "vacge.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                   (const_string "neon_fp_vadd_ddd_vabs_dd")
+                   (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vcagt<mode>"
+  [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
+        (unspec:<V_cmp_result> [(match_operand:VCVTF 1 "s_register_operand" "w")
+		                (match_operand:VCVTF 2 "s_register_operand" "w")
+                                (match_operand:SI 3 "immediate_operand" "i")]
+                               UNSPEC_VCAGT))]
+  "TARGET_NEON"
+  "vacgt.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                   (const_string "neon_fp_vadd_ddd_vabs_dd")
+                   (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vtst<mode>"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+        (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+		       (match_operand:VDQIW 2 "s_register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+		      UNSPEC_VTST))]
+  "TARGET_NEON"
+  "vtst.<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "neon_vabd<mode>"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+        (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w")
+		      (match_operand:VDQW 2 "s_register_operand" "w")
+		      (match_operand:SI 3 "immediate_operand" "i")]
+		     UNSPEC_VABD))]
+  "TARGET_NEON"
+  "vabd.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                   (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                 (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                 (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                   (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vabdl<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+        (unspec:<V_widen> [(match_operand:VW 1 "s_register_operand" "w")
+		           (match_operand:VW 2 "s_register_operand" "w")
+                           (match_operand:SI 3 "immediate_operand" "i")]
+                          UNSPEC_VABDL))]
+  "TARGET_NEON"
+  "vabdl.%T3%#<V_sz_elem>\t%q0, %P1, %P2"
+  [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "neon_vaba<mode>"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+        (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "0")
+		       (match_operand:VDQIW 2 "s_register_operand" "w")
+		       (match_operand:VDQIW 3 "s_register_operand" "w")
+                       (match_operand:SI 4 "immediate_operand" "i")]
+		      UNSPEC_VABA))]
+  "TARGET_NEON"
+  "vaba.%T4%#<V_sz_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                   (const_string "neon_vaba") (const_string "neon_vaba_qqq")))]
+)
+
+(define_insn "neon_vabal<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+        (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+		           (match_operand:VW 2 "s_register_operand" "w")
+		           (match_operand:VW 3 "s_register_operand" "w")
+                           (match_operand:SI 4 "immediate_operand" "i")]
+                          UNSPEC_VABAL))]
+  "TARGET_NEON"
+  "vabal.%T4%#<V_sz_elem>\t%q0, %P2, %P3"
+  [(set_attr "neon_type" "neon_vaba")]
+)
+
+(define_insn "neon_vmax<mode>"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+        (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w")
+		      (match_operand:VDQW 2 "s_register_operand" "w")
+		      (match_operand:SI 3 "immediate_operand" "i")]
+                     UNSPEC_VMAX))]
+  "TARGET_NEON"
+  "vmax.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+    (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                  (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                  (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vmin<mode>"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+        (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w")
+		      (match_operand:VDQW 2 "s_register_operand" "w")
+		      (match_operand:SI 3 "immediate_operand" "i")]
+                     UNSPEC_VMIN))]
+  "TARGET_NEON"
+  "vmin.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+    (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                  (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                  (const_string "neon_int_5")))]
+)
+
+(define_expand "neon_vpadd<mode>"
+  [(match_operand:VD 0 "s_register_operand" "=w")
+   (match_operand:VD 1 "s_register_operand" "w")
+   (match_operand:VD 2 "s_register_operand" "w")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_NEON"
+{
+  emit_insn (gen_neon_vpadd_internal<mode> (operands[0], operands[1],
+					    operands[2]));
+  DONE;
+})
+
+(define_insn "neon_vpaddl<mode>"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "=w")
+        (unspec:<V_double_width> [(match_operand:VDQIW 1 "s_register_operand" "w")
+                                  (match_operand:SI 2 "immediate_operand" "i")]
+                                 UNSPEC_VPADDL))]
+  "TARGET_NEON"
+  "vpaddl.%T2%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
+  ;; Assume this schedules like vaddl.
+  [(set_attr "neon_type" "neon_int_3")]
+)
+
+(define_insn "neon_vpadal<mode>"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "=w")
+        (unspec:<V_double_width> [(match_operand:<V_double_width> 1 "s_register_operand" "0")
+                                  (match_operand:VDQIW 2 "s_register_operand" "w")
+                                  (match_operand:SI 3 "immediate_operand" "i")]
+                                 UNSPEC_VPADAL))]
+  "TARGET_NEON"
+  "vpadal.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
+  ;; Assume this schedules like vpadd.
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "neon_vpmax<mode>"
+  [(set (match_operand:VD 0 "s_register_operand" "=w")
+        (unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
+		    (match_operand:VD 2 "s_register_operand" "w")
+                    (match_operand:SI 3 "immediate_operand" "i")]
+                   UNSPEC_VPMAX))]
+  "TARGET_NEON"
+  "vpmax.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  ;; Assume this schedules like vmax.
+  [(set (attr "neon_type")
+    (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                  (const_string "neon_fp_vadd_ddd_vabs_dd")
+                  (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vpmin<mode>"
+  [(set (match_operand:VD 0 "s_register_operand" "=w")
+        (unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
+		    (match_operand:VD 2 "s_register_operand" "w")
+                    (match_operand:SI 3 "immediate_operand" "i")]
+                   UNSPEC_VPMIN))]
+  "TARGET_NEON"
+  "vpmin.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  ;; Assume this schedules like vmin.
+  [(set (attr "neon_type")
+    (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                  (const_string "neon_fp_vadd_ddd_vabs_dd")
+                  (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vrecps<mode>"
+  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+        (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
+		       (match_operand:VCVTF 2 "s_register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_VRECPS))]
+  "TARGET_NEON"
+  "vrecps.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_fp_vrecps_vrsqrts_ddd")
+                    (const_string "neon_fp_vrecps_vrsqrts_qqq")))]
+)
+
+(define_insn "neon_vrsqrts<mode>"
+  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+        (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
+		       (match_operand:VCVTF 2 "s_register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_VRSQRTS))]
+  "TARGET_NEON"
+  "vrsqrts.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_fp_vrecps_vrsqrts_ddd")
+                    (const_string "neon_fp_vrecps_vrsqrts_qqq")))]
+)
+
+(define_insn "neon_vabs<mode>"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+	(unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w")
+		      (match_operand:SI 2 "immediate_operand" "i")]
+                     UNSPEC_VABS))]
+  "TARGET_NEON"
+  "vabs.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                   (if_then_else
+                      (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                      (const_string "neon_fp_vadd_ddd_vabs_dd")
+                      (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                   (const_string "neon_vqneg_vqabs")))]
+)
+
+(define_insn "neon_vqabs<mode>"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+	(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+		       (match_operand:SI 2 "immediate_operand" "i")]
+		      UNSPEC_VQABS))]
+  "TARGET_NEON"
+  "vqabs.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
+  [(set_attr "neon_type" "neon_vqneg_vqabs")]
+)
+
+(define_expand "neon_vneg<mode>"
+  [(match_operand:VDQW 0 "s_register_operand" "")
+   (match_operand:VDQW 1 "s_register_operand" "")
+   (match_operand:SI 2 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  emit_insn (gen_neg<mode>2 (operands[0], operands[1]));
+  DONE;
+})
+
+(define_insn "neon_vqneg<mode>"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+	(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+		       (match_operand:SI 2 "immediate_operand" "i")]
+		      UNSPEC_VQNEG))]
+  "TARGET_NEON"
+  "vqneg.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
+  [(set_attr "neon_type" "neon_vqneg_vqabs")]
+)
+
+(define_insn "neon_vcls<mode>"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+	(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+		       (match_operand:SI 2 "immediate_operand" "i")]
+		      UNSPEC_VCLS))]
+  "TARGET_NEON"
+  "vcls.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "neon_vclz<mode>"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+	(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+		       (match_operand:SI 2 "immediate_operand" "i")]
+		      UNSPEC_VCLZ))]
+  "TARGET_NEON"
+  "vclz.<V_if_elem>\t%<V_reg>0, %<V_reg>1"
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "neon_vcnt<mode>"
+  [(set (match_operand:VE 0 "s_register_operand" "=w")
+	(unspec:VE [(match_operand:VE 1 "s_register_operand" "w")
+                    (match_operand:SI 2 "immediate_operand" "i")]
+                   UNSPEC_VCNT))]
+  "TARGET_NEON"
+  "vcnt.<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "neon_vrecpe<mode>"
+  [(set (match_operand:V32 0 "s_register_operand" "=w")
+	(unspec:V32 [(match_operand:V32 1 "s_register_operand" "w")
+                     (match_operand:SI 2 "immediate_operand" "i")]
+                    UNSPEC_VRECPE))]
+  "TARGET_NEON"
+  "vrecpe.<V_u_elem>\t%<V_reg>0, %<V_reg>1"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_fp_vadd_ddd_vabs_dd")
+                    (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vrsqrte<mode>"
+  [(set (match_operand:V32 0 "s_register_operand" "=w")
+	(unspec:V32 [(match_operand:V32 1 "s_register_operand" "w")
+                     (match_operand:SI 2 "immediate_operand" "i")]
+                    UNSPEC_VRSQRTE))]
+  "TARGET_NEON"
+  "vrsqrte.<V_u_elem>\t%<V_reg>0, %<V_reg>1"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_fp_vadd_ddd_vabs_dd")
+                    (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_expand "neon_vmvn<mode>"
+  [(match_operand:VDQIW 0 "s_register_operand" "")
+   (match_operand:VDQIW 1 "s_register_operand" "")
+   (match_operand:SI 2 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  emit_insn (gen_one_cmpl<mode>2 (operands[0], operands[1]));
+  DONE;
+})
+
+;; FIXME: 32-bit element sizes are a bit funky (should be output as .32 not
+;; .u32), but the assembler should cope with that.
+
+(define_insn "neon_vget_lane<mode>"
+  [(set (match_operand:<V_elem> 0 "s_register_operand" "=r")
+	(unspec:<V_elem> [(match_operand:VD 1 "s_register_operand" "w")
+			  (match_operand:SI 2 "immediate_operand" "i")
+                          (match_operand:SI 3 "immediate_operand" "i")]
+                         UNSPEC_VGET_LANE))]
+  "TARGET_NEON"
+  "vmov%?.%t3%#<V_sz_elem>\t%0, %P1[%c2]"
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
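+
+;; For reference, a hedged source-level sketch of the kind of access this
+;; pattern backs (assuming the usual arm_neon.h intrinsic names):
+;;
+;;   #include <arm_neon.h>
+;;   int32_t get_lane_example (int32x2_t v)
+;;   {
+;;     return vget_lane_s32 (v, 1);   /* a vmov from lane 1 of a D register */
+;;   }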
+
+; Operand 2 (lane number) is ignored because we can only extract the zeroth lane
+; with this insn. Operand 3 (info word) is ignored because it does nothing
+; useful with 64-bit elements.
+
+(define_insn "neon_vget_lanedi"
+  [(set (match_operand:DI 0 "s_register_operand" "=r")
+       (unspec:DI [(match_operand:DI 1 "s_register_operand" "w")
+                   (match_operand:SI 2 "immediate_operand" "i")
+                   (match_operand:SI 3 "immediate_operand" "i")]
+                  UNSPEC_VGET_LANE))]
+  "TARGET_NEON"
+  "vmov%?\t%Q0, %R0, %P1  @ di"
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vget_lane<mode>"
+  [(set (match_operand:<V_elem> 0 "s_register_operand" "=r")
+       (unspec:<V_elem> [(match_operand:VQ 1 "s_register_operand" "w")
+                         (match_operand:SI 2 "immediate_operand" "i")
+                         (match_operand:SI 3 "immediate_operand" "i")]
+                        UNSPEC_VGET_LANE))]
+  "TARGET_NEON"
+{
+  rtx ops[4];
+  int regno = REGNO (operands[1]);
+  unsigned int halfelts = GET_MODE_NUNITS (<MODE>mode) / 2;
+  unsigned int elt = INTVAL (operands[2]);
+
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (<V_HALF>mode, regno + 2 * (elt / halfelts));
+  ops[2] = GEN_INT (elt % halfelts);
+  ops[3] = operands[3];
+  output_asm_insn ("vmov%?.%t3%#<V_sz_elem>\t%0, %P1[%c2]", ops);
+
+  return "";
+}
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
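+
+;; Hedged sketch (usual arm_neon.h names assumed) of the lane renumbering the
+;; code above performs: for a Q-register vector it selects the D-register
+;; half that contains the requested lane and uses the lane index within that
+;; half.
+;;
+;;   #include <arm_neon.h>
+;;   int16_t get_high_lane (int16x8_t v)
+;;   {
+;;     return vgetq_lane_s16 (v, 5);  /* lane 5 of the Q reg = lane 1 of its high D reg */
+;;   }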
+
+(define_insn "neon_vget_lanev2di"
+  [(set (match_operand:DI 0 "s_register_operand" "=r")
+       (unspec:DI [(match_operand:V2DI 1 "s_register_operand" "w")
+                   (match_operand:SI 2 "immediate_operand" "i")
+                   (match_operand:SI 3 "immediate_operand" "i")]
+                  UNSPEC_VGET_LANE))]
+  "TARGET_NEON"
+{
+  rtx ops[2];
+  unsigned int regno = REGNO (operands[1]);
+  unsigned int elt = INTVAL (operands[2]);
+
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (DImode, regno + 2 * elt);
+  output_asm_insn ("vmov%?\t%Q0, %R0, %P1  @ v2di", ops);
+
+  return "";
+}
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vset_lane<mode>"
+  [(set (match_operand:VD 0 "s_register_operand" "=w")
+	(unspec:VD [(match_operand:<V_elem> 1 "s_register_operand" "r")
+		    (match_operand:VD 2 "s_register_operand" "0")
+                    (match_operand:SI 3 "immediate_operand" "i")]
+                   UNSPEC_VSET_LANE))]
+  "TARGET_NEON"
+  "vmov%?.<V_sz_elem>\t%P0[%c3], %1"
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
+
+; See neon_vget_lanedi comment for reasons operands 2 & 3 are ignored.
+
+(define_insn "neon_vset_lanedi"
+  [(set (match_operand:DI 0 "s_register_operand" "=w")
+	(unspec:DI [(match_operand:DI 1 "s_register_operand" "r")
+		    (match_operand:DI 2 "s_register_operand" "0")
+                    (match_operand:SI 3 "immediate_operand" "i")]
+                   UNSPEC_VSET_LANE))]
+  "TARGET_NEON"
+  "vmov%?\t%P0, %Q1, %R1  @ di"
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vset_lane<mode>"
+  [(set (match_operand:VQ 0 "s_register_operand" "=w")
+	(unspec:VQ [(match_operand:<V_elem> 1 "s_register_operand" "r")
+		    (match_operand:VQ 2 "s_register_operand" "0")
+                    (match_operand:SI 3 "immediate_operand" "i")]
+                   UNSPEC_VSET_LANE))]
+  "TARGET_NEON"
+{
+  rtx ops[4];
+  unsigned int regno = REGNO (operands[0]);
+  unsigned int halfelts = GET_MODE_NUNITS (<MODE>mode) / 2;
+  unsigned int elt = INTVAL (operands[3]);
+
+  ops[0] = gen_rtx_REG (<V_HALF>mode, regno + 2 * (elt / halfelts));
+  ops[1] = operands[1];
+  ops[2] = GEN_INT (elt % halfelts);
+  output_asm_insn ("vmov%?.<V_sz_elem>\t%P0[%c2], %1", ops);
+
+  return "";
+}
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vset_lanev2di"
+  [(set (match_operand:V2DI 0 "s_register_operand" "=w")
+	(unspec:V2DI [(match_operand:DI 1 "s_register_operand" "r")
+		      (match_operand:V2DI 2 "s_register_operand" "0")
+                      (match_operand:SI 3 "immediate_operand" "i")]
+                   UNSPEC_VSET_LANE))]
+  "TARGET_NEON"
+{
+  rtx ops[2];
+  unsigned int regno = REGNO (operands[0]);
+  unsigned int elt = INTVAL (operands[3]);
+
+  ops[0] = gen_rtx_REG (DImode, regno + 2 * elt);
+  ops[1] = operands[1];
+  output_asm_insn ("vmov%?\t%P0, %Q1, %R1  @ v2di", ops);
+
+  return "";
+}
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_expand "neon_vcreate<mode>"
+  [(match_operand:VDX 0 "s_register_operand" "")
+   (match_operand:DI 1 "general_operand" "")]
+  "TARGET_NEON"
+{
+  rtx src = gen_lowpart (<MODE>mode, operands[1]);
+  emit_move_insn (operands[0], src);
+  DONE;
+})
+
+(define_insn "neon_vdup_n<mode>"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+	(unspec:VDQW [(match_operand:<V_elem> 1 "s_register_operand" "r")]
+                    UNSPEC_VDUP_N))]
+  "TARGET_NEON"
+  "vdup%?.<V_sz_elem>\t%<V_reg>0, %1"
+  ;; Assume this schedules like vmov.
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vdup_ndi"
+  [(set (match_operand:DI 0 "s_register_operand" "=w")
+	(unspec:DI [(match_operand:DI 1 "s_register_operand" "r")]
+                   UNSPEC_VDUP_N))]
+  "TARGET_NEON"
+  "vmov%?\t%P0, %Q1, %R1"
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vdup_nv2di"
+  [(set (match_operand:V2DI 0 "s_register_operand" "=w")
+	(unspec:V2DI [(match_operand:DI 1 "s_register_operand" "r")]
+                     UNSPEC_VDUP_N))]
+  "TARGET_NEON"
+  "vmov%?\t%e0, %Q1, %R1\;vmov%?\t%f0, %Q1, %R1"
+  [(set_attr "predicable" "yes")
+   (set_attr "length" "8")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vdup_lane<mode>"
+  [(set (match_operand:VD 0 "s_register_operand" "=w")
+	(unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
+		    (match_operand:SI 2 "immediate_operand" "i")]
+                   UNSPEC_VDUP_LANE))]
+  "TARGET_NEON"
+  "vdup.<V_sz_elem>\t%P0, %P1[%c2]"
+  ;; Assume this schedules like vmov.
+  [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vdup_lane<mode>"
+  [(set (match_operand:VQ 0 "s_register_operand" "=w")
+	(unspec:VQ [(match_operand:<V_HALF> 1 "s_register_operand" "w")
+		    (match_operand:SI 2 "immediate_operand" "i")]
+                   UNSPEC_VDUP_LANE))]
+  "TARGET_NEON"
+  "vdup.<V_sz_elem>\t%q0, %P1[%c2]"
+  ;; Assume this schedules like vmov.
+  [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+; Scalar index is ignored, since only zero is valid here.
+(define_expand "neon_vdup_lanedi"
+  [(set (match_operand:DI 0 "s_register_operand" "=w")
+	(unspec:DI [(match_operand:DI 1 "s_register_operand" "w")
+		    (match_operand:SI 2 "immediate_operand" "i")]
+                   UNSPEC_VDUP_LANE))]
+  "TARGET_NEON"
+{
+  emit_move_insn (operands[0], operands[1]);
+  DONE;
+})
+
+; Likewise.
+(define_insn "neon_vdup_lanev2di"
+  [(set (match_operand:V2DI 0 "s_register_operand" "=w")
+	(unspec:V2DI [(match_operand:DI 1 "s_register_operand" "w")
+		      (match_operand:SI 2 "immediate_operand" "i")]
+                     UNSPEC_VDUP_LANE))]
+  "TARGET_NEON"
+  "vmov\t%e0, %P1\;vmov\t%f0, %P1"
+  [(set_attr "length" "8")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
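+
+;; Hedged usage sketch (usual arm_neon.h names assumed): with 64-bit elements
+;; only lane 0 exists, so the "dup" degenerates into the plain moves above.
+;;
+;;   #include <arm_neon.h>
+;;   int64x2_t dup_example (int64x1_t v)
+;;   {
+;;     return vdupq_lane_s64 (v, 0);  /* copies the single D reg into both halves */
+;;   }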
+
+;; In this insn, operand 1 supplies the low half and operand 2 the high half
+;; of the destination vector.
+;; FIXME: A different implementation of this builtin could make it much more
+;; likely that no instructions need to be emitted at all, by arranging for
+;; the register allocator to place the two halves in the right registers to
+;; begin with.  The lack of subregs for vectors makes that tricky, though.
+
+(define_insn "neon_vcombine<mode>"
+  [(set (match_operand:<V_DOUBLE> 0 "s_register_operand" "=w")
+	(unspec:<V_DOUBLE> [(match_operand:VDX 1 "s_register_operand" "w")
+			    (match_operand:VDX 2 "s_register_operand" "w")]
+                           UNSPEC_VCOMBINE))]
+  "TARGET_NEON"
+{
+  int dest = REGNO (operands[0]);
+  int src1 = REGNO (operands[1]);
+  int src2 = REGNO (operands[2]);
+  rtx destlo;
+  
+  if (src1 == dest && src2 == dest + 2)
+    return "";
+  else if (src2 == dest && src1 == dest + 2)
+    /* Special case of reversed high/low parts.  */
+    return "vswp\t%P1, %P2";
+  
+  destlo = gen_rtx_REG (<MODE>mode, dest);
+  
+  if (!reg_overlap_mentioned_p (operands[2], destlo))
+    {
+      /* Try to avoid unnecessary moves if part of the result is in the right
+         place already.  */
+      if (src1 != dest)
+        output_asm_insn ("vmov\t%e0, %P1", operands);
+      if (src2 != dest + 2)
+        output_asm_insn ("vmov\t%f0, %P2", operands);
+    }
+  else
+    {
+      if (src2 != dest + 2)
+        output_asm_insn ("vmov\t%f0, %P2", operands);
+      if (src1 != dest)
+        output_asm_insn ("vmov\t%e0, %P1", operands);
+    }
+  
+  return "";
+}
+  ;; We set the neon_type attribute based on the vmov instructions above.
+  [(set_attr "length" "8")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
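+
+;; Hedged source-level sketch (usual arm_neon.h names assumed): the first
+;; argument becomes the low half of the result and the second the high half,
+;; and no instructions are emitted when register allocation already places
+;; them there.
+;;
+;;   #include <arm_neon.h>
+;;   int32x4_t combine_example (int32x2_t lo, int32x2_t hi)
+;;   {
+;;     return vcombine_s32 (lo, hi);  /* result = { lo[0], lo[1], hi[0], hi[1] } */
+;;   }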
+
+(define_insn "neon_vget_high<mode>"
+  [(set (match_operand:<V_HALF> 0 "s_register_operand" "=w")
+	(unspec:<V_HALF> [(match_operand:VQX 1 "s_register_operand" "w")]
+			 UNSPEC_VGET_HIGH))]
+  "TARGET_NEON"
+{
+  int dest = REGNO (operands[0]);
+  int src = REGNO (operands[1]);
+  
+  if (dest != src + 2)
+    return "vmov\t%P0, %f1";
+  else
+    return "";
+}
+  [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vget_low<mode>"
+  [(set (match_operand:<V_HALF> 0 "s_register_operand" "=w")
+	(unspec:<V_HALF> [(match_operand:VQX 1 "s_register_operand" "w")]
+			 UNSPEC_VGET_LOW))]
+  "TARGET_NEON"
+{
+  int dest = REGNO (operands[0]);
+  int src = REGNO (operands[1]);
+  
+  if (dest != src)
+    return "vmov\t%P0, %e1";
+  else
+    return "";
+}
+  [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vcvt<mode>"
+  [(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
+	(unspec:<V_CVTTO> [(match_operand:VCVTF 1 "s_register_operand" "w")
+			   (match_operand:SI 2 "immediate_operand" "i")]
+			  UNSPEC_VCVT))]
+  "TARGET_NEON"
+  "vcvt.%T2%#32.f32\t%<V_reg>0, %<V_reg>1"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                   (const_string "neon_fp_vadd_ddd_vabs_dd")
+                   (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vcvt<mode>"
+  [(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
+	(unspec:<V_CVTTO> [(match_operand:VCVTI 1 "s_register_operand" "w")
+			   (match_operand:SI 2 "immediate_operand" "i")]
+			  UNSPEC_VCVT))]
+  "TARGET_NEON"
+  "vcvt.f32.%T2%#32\t%<V_reg>0, %<V_reg>1"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                   (const_string "neon_fp_vadd_ddd_vabs_dd")
+                   (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vcvt_n<mode>"
+  [(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
+	(unspec:<V_CVTTO> [(match_operand:VCVTF 1 "s_register_operand" "w")
+			   (match_operand:SI 2 "immediate_operand" "i")
+                           (match_operand:SI 3 "immediate_operand" "i")]
+			  UNSPEC_VCVT_N))]
+  "TARGET_NEON"
+  "vcvt.%T3%#32.f32\t%<V_reg>0, %<V_reg>1, %2"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                   (const_string "neon_fp_vadd_ddd_vabs_dd")
+                   (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vcvt_n<mode>"
+  [(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
+	(unspec:<V_CVTTO> [(match_operand:VCVTI 1 "s_register_operand" "w")
+			   (match_operand:SI 2 "immediate_operand" "i")
+                           (match_operand:SI 3 "immediate_operand" "i")]
+			  UNSPEC_VCVT_N))]
+  "TARGET_NEON"
+  "vcvt.f32.%T3%#32\t%<V_reg>0, %<V_reg>1, %2"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                   (const_string "neon_fp_vadd_ddd_vabs_dd")
+                   (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vmovn<mode>"
+  [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+	(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+			    (match_operand:SI 2 "immediate_operand" "i")]
+                           UNSPEC_VMOVN))]
+  "TARGET_NEON"
+  "vmovn.<V_if_elem>\t%P0, %q1"
+  [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vqmovn<mode>"
+  [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+	(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+			    (match_operand:SI 2 "immediate_operand" "i")]
+                           UNSPEC_VQMOVN))]
+  "TARGET_NEON"
+  "vqmovn.%T2%#<V_sz_elem>\t%P0, %q1"
+  [(set_attr "neon_type" "neon_shift_2")]
+)
+
+(define_insn "neon_vqmovun<mode>"
+  [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+	(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+			    (match_operand:SI 2 "immediate_operand" "i")]
+                           UNSPEC_VQMOVUN))]
+  "TARGET_NEON"
+  "vqmovun.<V_s_elem>\t%P0, %q1"
+  [(set_attr "neon_type" "neon_shift_2")]
+)
+
+(define_insn "neon_vmovl<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+	(unspec:<V_widen> [(match_operand:VW 1 "s_register_operand" "w")
+			   (match_operand:SI 2 "immediate_operand" "i")]
+                          UNSPEC_VMOVL))]
+  "TARGET_NEON"
+  "vmovl.%T2%#<V_sz_elem>\t%q0, %P1"
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_insn "neon_vmul_lane<mode>"
+  [(set (match_operand:VMD 0 "s_register_operand" "=w")
+	(unspec:VMD [(match_operand:VMD 1 "s_register_operand" "w")
+		     (match_operand:VMD 2 "s_register_operand"
+                                        "<scalar_mul_constraint>")
+                     (match_operand:SI 3 "immediate_operand" "i")
+                     (match_operand:SI 4 "immediate_operand" "i")]
+                    UNSPEC_VMUL_LANE))]
+  "TARGET_NEON"
+  "vmul.<V_if_elem>\t%P0, %P1, %P2[%c3]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                   (const_string "neon_fp_vmul_ddd")
+                   (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                 (const_string "neon_mul_ddd_16_scalar_32_16_long_scalar")
+                                 (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar"))))]
+)
+
+(define_insn "neon_vmul_lane<mode>"
+  [(set (match_operand:VMQ 0 "s_register_operand" "=w")
+	(unspec:VMQ [(match_operand:VMQ 1 "s_register_operand" "w")
+		     (match_operand:<V_HALF> 2 "s_register_operand"
+                                             "<scalar_mul_constraint>")
+                     (match_operand:SI 3 "immediate_operand" "i")
+                     (match_operand:SI 4 "immediate_operand" "i")]
+                    UNSPEC_VMUL_LANE))]
+  "TARGET_NEON"
+  "vmul.<V_if_elem>\t%q0, %q1, %P2[%c3]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                   (const_string "neon_fp_vmul_qqd")
+                   (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                 (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")
+                                 (const_string "neon_mul_qqd_32_scalar"))))]
+)
+
+(define_insn "neon_vmull_lane<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+	(unspec:<V_widen> [(match_operand:VMDI 1 "s_register_operand" "w")
+		           (match_operand:VMDI 2 "s_register_operand"
+					       "<scalar_mul_constraint>")
+                           (match_operand:SI 3 "immediate_operand" "i")
+                           (match_operand:SI 4 "immediate_operand" "i")]
+                          UNSPEC_VMULL_LANE))]
+  "TARGET_NEON"
+  "vmull.%T4%#<V_sz_elem>\t%q0, %P1, %P2[%c3]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mul_ddd_16_scalar_32_16_long_scalar")
+                   (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))]
+)
+
+(define_insn "neon_vqdmull_lane<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+	(unspec:<V_widen> [(match_operand:VMDI 1 "s_register_operand" "w")
+		           (match_operand:VMDI 2 "s_register_operand"
+					       "<scalar_mul_constraint>")
+                           (match_operand:SI 3 "immediate_operand" "i")
+                           (match_operand:SI 4 "immediate_operand" "i")]
+                          UNSPEC_VQDMULL_LANE))]
+  "TARGET_NEON"
+  "vqdmull.<V_s_elem>\t%q0, %P1, %P2[%c3]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mul_ddd_16_scalar_32_16_long_scalar")
+                   (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))]
+)
+
+(define_insn "neon_vqdmulh_lane<mode>"
+  [(set (match_operand:VMQI 0 "s_register_operand" "=w")
+	(unspec:VMQI [(match_operand:VMQI 1 "s_register_operand" "w")
+		      (match_operand:<V_HALF> 2 "s_register_operand"
+					      "<scalar_mul_constraint>")
+                      (match_operand:SI 3 "immediate_operand" "i")
+                      (match_operand:SI 4 "immediate_operand" "i")]
+                      UNSPEC_VQDMULH_LANE))]
+  "TARGET_NEON"
+  "vq%O4dmulh.%T4%#<V_sz_elem>\t%q0, %q1, %P2[%c3]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")
+                   (const_string "neon_mul_qqd_32_scalar")))]
+)
+
+(define_insn "neon_vqdmulh_lane<mode>"
+  [(set (match_operand:VMDI 0 "s_register_operand" "=w")
+	(unspec:VMDI [(match_operand:VMDI 1 "s_register_operand" "w")
+		      (match_operand:VMDI 2 "s_register_operand"
+					  "<scalar_mul_constraint>")
+                      (match_operand:SI 3 "immediate_operand" "i")
+                      (match_operand:SI 4 "immediate_operand" "i")]
+                      UNSPEC_VQDMULH_LANE))]
+  "TARGET_NEON"
+  "vq%O4dmulh.%T4%#<V_sz_elem>\t%P0, %P1, %P2[%c3]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mul_ddd_16_scalar_32_16_long_scalar")
+                   (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))]
+)
+
+(define_insn "neon_vmla_lane<mode>"
+  [(set (match_operand:VMD 0 "s_register_operand" "=w")
+	(unspec:VMD [(match_operand:VMD 1 "s_register_operand" "0")
+		     (match_operand:VMD 2 "s_register_operand" "w")
+                     (match_operand:VMD 3 "s_register_operand"
+					"<scalar_mul_constraint>")
+                     (match_operand:SI 4 "immediate_operand" "i")
+                     (match_operand:SI 5 "immediate_operand" "i")]
+                     UNSPEC_VMLA_LANE))]
+  "TARGET_NEON"
+  "vmla.<V_if_elem>\t%P0, %P2, %P3[%c4]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                   (const_string "neon_fp_vmla_ddd_scalar")
+                   (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                 (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")
+                                 (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"))))]
+)
+
+(define_insn "neon_vmla_lane<mode>"
+  [(set (match_operand:VMQ 0 "s_register_operand" "=w")
+	(unspec:VMQ [(match_operand:VMQ 1 "s_register_operand" "0")
+		     (match_operand:VMQ 2 "s_register_operand" "w")
+                     (match_operand:<V_HALF> 3 "s_register_operand"
+					     "<scalar_mul_constraint>")
+                     (match_operand:SI 4 "immediate_operand" "i")
+                     (match_operand:SI 5 "immediate_operand" "i")]
+                     UNSPEC_VMLA_LANE))]
+  "TARGET_NEON"
+  "vmla.<V_if_elem>\t%q0, %q2, %P3[%c4]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                   (const_string "neon_fp_vmla_qqq_scalar")
+                   (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                 (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")
+                                 (const_string "neon_mla_qqq_32_qqd_32_scalar"))))]
+)
+
+(define_insn "neon_vmlal_lane<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+	(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+			   (match_operand:VMDI 2 "s_register_operand" "w")
+                           (match_operand:VMDI 3 "s_register_operand"
+					       "<scalar_mul_constraint>")
+                           (match_operand:SI 4 "immediate_operand" "i")
+                           (match_operand:SI 5 "immediate_operand" "i")]
+                          UNSPEC_VMLAL_LANE))]
+  "TARGET_NEON"
+  "vmlal.%T5%#<V_sz_elem>\t%q0, %P2, %P3[%c4]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")
+                   (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vqdmlal_lane<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+	(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+			   (match_operand:VMDI 2 "s_register_operand" "w")
+                           (match_operand:VMDI 3 "s_register_operand"
+					       "<scalar_mul_constraint>")
+                           (match_operand:SI 4 "immediate_operand" "i")
+                           (match_operand:SI 5 "immediate_operand" "i")]
+                          UNSPEC_VQDMLAL_LANE))]
+  "TARGET_NEON"
+  "vqdmlal.<V_s_elem>\t%q0, %P2, %P3[%c4]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")
+                   (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vmls_lane<mode>"
+  [(set (match_operand:VMD 0 "s_register_operand" "=w")
+	(unspec:VMD [(match_operand:VMD 1 "s_register_operand" "0")
+		     (match_operand:VMD 2 "s_register_operand" "w")
+                     (match_operand:VMD 3 "s_register_operand"
+					"<scalar_mul_constraint>")
+                     (match_operand:SI 4 "immediate_operand" "i")
+                     (match_operand:SI 5 "immediate_operand" "i")]
+                    UNSPEC_VMLS_LANE))]
+  "TARGET_NEON"
+  "vmls.<V_if_elem>\t%P0, %P2, %P3[%c4]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                   (const_string "neon_fp_vmla_ddd_scalar")
+                   (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                 (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")
+                                 (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"))))]
+)
+
+(define_insn "neon_vmls_lane<mode>"
+  [(set (match_operand:VMQ 0 "s_register_operand" "=w")
+	(unspec:VMQ [(match_operand:VMQ 1 "s_register_operand" "0")
+		     (match_operand:VMQ 2 "s_register_operand" "w")
+                     (match_operand:<V_HALF> 3 "s_register_operand"
+					     "<scalar_mul_constraint>")
+                     (match_operand:SI 4 "immediate_operand" "i")
+                     (match_operand:SI 5 "immediate_operand" "i")]
+                    UNSPEC_VMLS_LANE))]
+  "TARGET_NEON"
+  "vmls.<V_if_elem>\t%q0, %q2, %P3[%c4]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                   (const_string "neon_fp_vmla_qqq_scalar")
+                   (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                                 (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")
+                                 (const_string "neon_mla_qqq_32_qqd_32_scalar"))))]
+)
+
+(define_insn "neon_vmlsl_lane<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+	(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+			   (match_operand:VMDI 2 "s_register_operand" "w")
+                           (match_operand:VMDI 3 "s_register_operand"
+					       "<scalar_mul_constraint>")
+                           (match_operand:SI 4 "immediate_operand" "i")
+                           (match_operand:SI 5 "immediate_operand" "i")]
+                          UNSPEC_VMLSL_LANE))]
+  "TARGET_NEON"
+  "vmlsl.%T5%#<V_sz_elem>\t%q0, %P2, %P3[%c4]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")
+                   (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vqdmlsl_lane<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+	(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+			   (match_operand:VMDI 2 "s_register_operand" "w")
+                           (match_operand:VMDI 3 "s_register_operand"
+					       "<scalar_mul_constraint>")
+                           (match_operand:SI 4 "immediate_operand" "i")
+                           (match_operand:SI 5 "immediate_operand" "i")]
+                          UNSPEC_VQDMLSL_LANE))]
+  "TARGET_NEON"
+  "vqdmlsl.<V_s_elem>\t%q0, %P2, %P3[%c4]"
+  [(set (attr "neon_type")
+     (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+                   (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")
+                   (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+; FIXME: For the "_n" multiply/multiply-accumulate insns, we copy a value in a
+; core register into a temp register, then use a scalar taken from that. This
+; isn't an optimal solution if e.g. the scalar has just been read from memory
+; or extracted from another vector. In the latter case it's currently better
+; to use the "_lane" variant, and the former case could probably be handled
+; using vld1_lane, but that hasn't been done yet.
+
+(define_expand "neon_vmul_n<mode>"
+  [(match_operand:VMD 0 "s_register_operand" "")
+   (match_operand:VMD 1 "s_register_operand" "")
+   (match_operand:<V_elem> 2 "s_register_operand" "")
+   (match_operand:SI 3 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_neon_vset_lane<mode> (tmp, operands[2], tmp, const0_rtx));
+  emit_insn (gen_neon_vmul_lane<mode> (operands[0], operands[1], tmp,
+				       const0_rtx, const0_rtx));
+  DONE;
+})
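+
+;; Hedged sketch of the equivalence the "_n" expansions rely on (usual
+;; arm_neon.h names assumed): the scalar is placed in lane 0 of a temporary
+;; vector and the "_lane" form is then used with lane 0.
+;;
+;;   #include <arm_neon.h>
+;;   float32x2_t mul_n_example (float32x2_t a, float32_t b)
+;;   {
+;;     /* Same result as vmul_lane_f32 (a, vdup_n_f32 (b), 0).  */
+;;     return vmul_n_f32 (a, b);
+;;   }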
+
+(define_expand "neon_vmul_n<mode>"
+  [(match_operand:VMQ 0 "s_register_operand" "")
+   (match_operand:VMQ 1 "s_register_operand" "")
+   (match_operand:<V_elem> 2 "s_register_operand" "")
+   (match_operand:SI 3 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<V_HALF>mode);
+  emit_insn (gen_neon_vset_lane<V_half> (tmp, operands[2], tmp, const0_rtx));
+  emit_insn (gen_neon_vmul_lane<mode> (operands[0], operands[1], tmp,
+				       const0_rtx, const0_rtx));
+  DONE;
+})
+
+(define_expand "neon_vmull_n<mode>"
+  [(match_operand:<V_widen> 0 "s_register_operand" "")
+   (match_operand:VMDI 1 "s_register_operand" "")
+   (match_operand:<V_elem> 2 "s_register_operand" "")
+   (match_operand:SI 3 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_neon_vset_lane<mode> (tmp, operands[2], tmp, const0_rtx));
+  emit_insn (gen_neon_vmull_lane<mode> (operands[0], operands[1], tmp,
+				        const0_rtx, operands[3]));
+  DONE;
+})
+
+(define_expand "neon_vqdmull_n<mode>"
+  [(match_operand:<V_widen> 0 "s_register_operand" "")
+   (match_operand:VMDI 1 "s_register_operand" "")
+   (match_operand:<V_elem> 2 "s_register_operand" "")
+   (match_operand:SI 3 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_neon_vset_lane<mode> (tmp, operands[2], tmp, const0_rtx));
+  emit_insn (gen_neon_vqdmull_lane<mode> (operands[0], operands[1], tmp,
+				          const0_rtx, const0_rtx));
+  DONE;
+})
+
+(define_expand "neon_vqdmulh_n<mode>"
+  [(match_operand:VMDI 0 "s_register_operand" "")
+   (match_operand:VMDI 1 "s_register_operand" "")
+   (match_operand:<V_elem> 2 "s_register_operand" "")
+   (match_operand:SI 3 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_neon_vset_lane<mode> (tmp, operands[2], tmp, const0_rtx));
+  emit_insn (gen_neon_vqdmulh_lane<mode> (operands[0], operands[1], tmp,
+				          const0_rtx, operands[3]));
+  DONE;
+})
+
+(define_expand "neon_vqdmulh_n<mode>"
+  [(match_operand:VMQI 0 "s_register_operand" "")
+   (match_operand:VMQI 1 "s_register_operand" "")
+   (match_operand:<V_elem> 2 "s_register_operand" "")
+   (match_operand:SI 3 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<V_HALF>mode);
+  emit_insn (gen_neon_vset_lane<V_half> (tmp, operands[2], tmp, const0_rtx));
+  emit_insn (gen_neon_vqdmulh_lane<mode> (operands[0], operands[1], tmp,
+				          const0_rtx, operands[3]));
+  DONE;
+})
+
+(define_expand "neon_vmla_n<mode>"
+  [(match_operand:VMD 0 "s_register_operand" "")
+   (match_operand:VMD 1 "s_register_operand" "")
+   (match_operand:VMD 2 "s_register_operand" "")
+   (match_operand:<V_elem> 3 "s_register_operand" "")
+   (match_operand:SI 4 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_neon_vset_lane<mode> (tmp, operands[3], tmp, const0_rtx));
+  emit_insn (gen_neon_vmla_lane<mode> (operands[0], operands[1], operands[2],
+				       tmp, const0_rtx, operands[4]));
+  DONE;
+})
+
+(define_expand "neon_vmla_n<mode>"
+  [(match_operand:VMQ 0 "s_register_operand" "")
+   (match_operand:VMQ 1 "s_register_operand" "")
+   (match_operand:VMQ 2 "s_register_operand" "")
+   (match_operand:<V_elem> 3 "s_register_operand" "")
+   (match_operand:SI 4 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<V_HALF>mode);
+  emit_insn (gen_neon_vset_lane<V_half> (tmp, operands[3], tmp, const0_rtx));
+  emit_insn (gen_neon_vmla_lane<mode> (operands[0], operands[1], operands[2],
+				       tmp, const0_rtx, operands[4]));
+  DONE;
+})
+
+(define_expand "neon_vmlal_n<mode>"
+  [(match_operand:<V_widen> 0 "s_register_operand" "")
+   (match_operand:<V_widen> 1 "s_register_operand" "")
+   (match_operand:VMDI 2 "s_register_operand" "")
+   (match_operand:<V_elem> 3 "s_register_operand" "")
+   (match_operand:SI 4 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_neon_vset_lane<mode> (tmp, operands[3], tmp, const0_rtx));
+  emit_insn (gen_neon_vmlal_lane<mode> (operands[0], operands[1], operands[2],
+					tmp, const0_rtx, operands[4]));
+  DONE;
+})
+
+(define_expand "neon_vqdmlal_n<mode>"
+  [(match_operand:<V_widen> 0 "s_register_operand" "")
+   (match_operand:<V_widen> 1 "s_register_operand" "")
+   (match_operand:VMDI 2 "s_register_operand" "")
+   (match_operand:<V_elem> 3 "s_register_operand" "")
+   (match_operand:SI 4 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_neon_vset_lane<mode> (tmp, operands[3], tmp, const0_rtx));
+  emit_insn (gen_neon_vqdmlal_lane<mode> (operands[0], operands[1], operands[2],
+					  tmp, const0_rtx, operands[4]));
+  DONE;
+})
+
+(define_expand "neon_vmls_n<mode>"
+  [(match_operand:VMD 0 "s_register_operand" "")
+   (match_operand:VMD 1 "s_register_operand" "")
+   (match_operand:VMD 2 "s_register_operand" "")
+   (match_operand:<V_elem> 3 "s_register_operand" "")
+   (match_operand:SI 4 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_neon_vset_lane<mode> (tmp, operands[3], tmp, const0_rtx));
+  emit_insn (gen_neon_vmls_lane<mode> (operands[0], operands[1], operands[2],
+				       tmp, const0_rtx, operands[4]));
+  DONE;
+})
+
+(define_expand "neon_vmls_n<mode>"
+  [(match_operand:VMQ 0 "s_register_operand" "")
+   (match_operand:VMQ 1 "s_register_operand" "")
+   (match_operand:VMQ 2 "s_register_operand" "")
+   (match_operand:<V_elem> 3 "s_register_operand" "")
+   (match_operand:SI 4 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<V_HALF>mode);
+  emit_insn (gen_neon_vset_lane<V_half> (tmp, operands[3], tmp, const0_rtx));
+  emit_insn (gen_neon_vmls_lane<mode> (operands[0], operands[1], operands[2],
+				       tmp, const0_rtx, operands[4]));
+  DONE;
+})
+
+(define_expand "neon_vmlsl_n<mode>"
+  [(match_operand:<V_widen> 0 "s_register_operand" "")
+   (match_operand:<V_widen> 1 "s_register_operand" "")
+   (match_operand:VMDI 2 "s_register_operand" "")
+   (match_operand:<V_elem> 3 "s_register_operand" "")
+   (match_operand:SI 4 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_neon_vset_lane<mode> (tmp, operands[3], tmp, const0_rtx));
+  emit_insn (gen_neon_vmlsl_lane<mode> (operands[0], operands[1], operands[2],
+					tmp, const0_rtx, operands[4]));
+  DONE;
+})
+
+(define_expand "neon_vqdmlsl_n<mode>"
+  [(match_operand:<V_widen> 0 "s_register_operand" "")
+   (match_operand:<V_widen> 1 "s_register_operand" "")
+   (match_operand:VMDI 2 "s_register_operand" "")
+   (match_operand:<V_elem> 3 "s_register_operand" "")
+   (match_operand:SI 4 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_neon_vset_lane<mode> (tmp, operands[3], tmp, const0_rtx));
+  emit_insn (gen_neon_vqdmlsl_lane<mode> (operands[0], operands[1], operands[2],
+					  tmp, const0_rtx, operands[4]));
+  DONE;
+})
+
+(define_insn "neon_vext<mode>"
+  [(set (match_operand:VDQX 0 "s_register_operand" "=w")
+	(unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w")
+		      (match_operand:VDQX 2 "s_register_operand" "w")
+                      (match_operand:SI 3 "immediate_operand" "i")]
+                     UNSPEC_VEXT))]
+  "TARGET_NEON"
+  "vext.<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2, %3"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_bp_simple")
+                    (const_string "neon_bp_2cycle")))]
+)
+
+(define_insn "neon_vrev64<mode>"
+  [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+	(unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "w")
+		     (match_operand:SI 2 "immediate_operand" "i")]
+                    UNSPEC_VREV64))]
+  "TARGET_NEON"
+  "vrev64.<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
+  [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vrev32<mode>"
+  [(set (match_operand:VX 0 "s_register_operand" "=w")
+	(unspec:VX [(match_operand:VX 1 "s_register_operand" "w")
+		    (match_operand:SI 2 "immediate_operand" "i")]
+                   UNSPEC_VREV32))]
+  "TARGET_NEON"
+  "vrev32.<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
+  [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vrev16<mode>"
+  [(set (match_operand:VE 0 "s_register_operand" "=w")
+	(unspec:VE [(match_operand:VE 1 "s_register_operand" "w")
+		    (match_operand:SI 2 "immediate_operand" "i")]
+                   UNSPEC_VREV16))]
+  "TARGET_NEON"
+  "vrev16.<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
+  [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+; vbsl_* intrinsics may compile to any of vbsl/vbif/vbit depending on register
+; allocation. For an intrinsic of form:
+;   rD = vbsl_* (rS, rN, rM)
+; We can use any of:
+;   vbsl rS, rN, rM  (if D = S)
+;   vbit rD, rN, rS  (if D = M, so 1-bits in rS choose bits from rN, else rM)
+;   vbif rD, rM, rS  (if D = N, so 0-bits in rS choose bits from rM, else rN)
+
+(define_insn "neon_vbsl<mode>_internal"
+  [(set (match_operand:VDQX 0 "s_register_operand"		 "=w,w,w")
+	(unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" " 0,w,w")
+		      (match_operand:VDQX 2 "s_register_operand" " w,w,0")
+                      (match_operand:VDQX 3 "s_register_operand" " w,0,w")]
+                     UNSPEC_VBSL))]
+  "TARGET_NEON"
+  "@
+  vbsl\t%<V_reg>0, %<V_reg>2, %<V_reg>3
+  vbit\t%<V_reg>0, %<V_reg>2, %<V_reg>1
+  vbif\t%<V_reg>0, %<V_reg>3, %<V_reg>1"
+  [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_expand "neon_vbsl<mode>"
+  [(set (match_operand:VDQX 0 "s_register_operand" "")
+        (unspec:VDQX [(match_operand:<V_cmp_result> 1 "s_register_operand" "")
+                      (match_operand:VDQX 2 "s_register_operand" "")
+                      (match_operand:VDQX 3 "s_register_operand" "")]
+                     UNSPEC_VBSL))]
+  "TARGET_NEON"
+{
+  /* We can't alias operands together if they have different modes.  */
+  operands[1] = gen_lowpart (<MODE>mode, operands[1]);
+})
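+
+;; Hedged usage sketch (usual arm_neon.h names assumed): the result takes
+;; bits from the second argument where the mask is 1 and from the third where
+;; it is 0; which of vbsl/vbit/vbif is emitted depends only on which input
+;; the register allocator ties to the destination.
+;;
+;;   #include <arm_neon.h>
+;;   uint32x2_t bsl_example (uint32x2_t mask, uint32x2_t a, uint32x2_t b)
+;;   {
+;;     return vbsl_u32 (mask, a, b);  /* (mask & a) | (~mask & b), bitwise */
+;;   }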
+
+(define_insn "neon_vshl<mode>"
+  [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+	(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+		       (match_operand:VDQIX 2 "s_register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_VSHL))]
+  "TARGET_NEON"
+  "v%O3shl.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_vshl_ddd")
+                    (const_string "neon_shift_3")))]
+)
+
+(define_insn "neon_vqshl<mode>"
+  [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+	(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+		       (match_operand:VDQIX 2 "s_register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_VQSHL))]
+  "TARGET_NEON"
+  "vq%O3shl.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_shift_2")
+                    (const_string "neon_vqshl_vrshl_vqrshl_qqq")))]
+)
+
+(define_insn "neon_vshr_n<mode>"
+  [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+	(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+		       (match_operand:SI 2 "immediate_operand" "i")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_VSHR_N))]
+  "TARGET_NEON"
+  "v%O3shr.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %2"
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_insn "neon_vshrn_n<mode>"
+  [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+	(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+			    (match_operand:SI 2 "immediate_operand" "i")
+			    (match_operand:SI 3 "immediate_operand" "i")]
+                           UNSPEC_VSHRN_N))]
+  "TARGET_NEON"
+  "v%O3shrn.<V_if_elem>\t%P0, %q1, %2"
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_insn "neon_vqshrn_n<mode>"
+  [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+	(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+			    (match_operand:SI 2 "immediate_operand" "i")
+			    (match_operand:SI 3 "immediate_operand" "i")]
+                           UNSPEC_VQSHRN_N))]
+  "TARGET_NEON"
+  "vq%O3shrn.%T3%#<V_sz_elem>\t%P0, %q1, %2"
+  [(set_attr "neon_type" "neon_shift_2")]
+)
+
+(define_insn "neon_vqshrun_n<mode>"
+  [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+	(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+			    (match_operand:SI 2 "immediate_operand" "i")
+			    (match_operand:SI 3 "immediate_operand" "i")]
+                           UNSPEC_VQSHRUN_N))]
+  "TARGET_NEON"
+  "vq%O3shrun.%T3%#<V_sz_elem>\t%P0, %q1, %2"
+  [(set_attr "neon_type" "neon_shift_2")]
+)
+
+(define_insn "neon_vshl_n<mode>"
+  [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+	(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+		       (match_operand:SI 2 "immediate_operand" "i")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_VSHL_N))]
+  "TARGET_NEON"
+  "vshl.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %2"
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_insn "neon_vqshl_n<mode>"
+  [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+	(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+		       (match_operand:SI 2 "immediate_operand" "i")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_VQSHL_N))]
+  "TARGET_NEON"
+  "vqshl.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %2"
+  [(set_attr "neon_type" "neon_shift_2")]
+)
+
+(define_insn "neon_vqshlu_n<mode>"
+  [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+	(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+		       (match_operand:SI 2 "immediate_operand" "i")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_VQSHLU_N))]
+  "TARGET_NEON"
+  "vqshlu.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %2"
+  [(set_attr "neon_type" "neon_shift_2")]
+)
+
+(define_insn "neon_vshll_n<mode>"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+	(unspec:<V_widen> [(match_operand:VW 1 "s_register_operand" "w")
+			   (match_operand:SI 2 "immediate_operand" "i")
+			   (match_operand:SI 3 "immediate_operand" "i")]
+			  UNSPEC_VSHLL_N))]
+  "TARGET_NEON"
+  "vshll.%T3%#<V_sz_elem>\t%q0, %P1, %2"
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_insn "neon_vsra_n<mode>"
+  [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+	(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "0")
+		       (match_operand:VDQIX 2 "s_register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")
+                       (match_operand:SI 4 "immediate_operand" "i")]
+                      UNSPEC_VSRA_N))]
+  "TARGET_NEON"
+  "v%O4sra.%T4%#<V_sz_elem>\t%<V_reg>0, %<V_reg>2, %3"
+  [(set_attr "neon_type" "neon_vsra_vrsra")]
+)
+
+(define_insn "neon_vsri_n<mode>"
+  [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+	(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "0")
+        	       (match_operand:VDQIX 2 "s_register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_VSRI))]
+  "TARGET_NEON"
+  "vsri.<V_sz_elem>\t%<V_reg>0, %<V_reg>2, %3"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_shift_1")
+                    (const_string "neon_shift_3")))]
+)
+
+(define_insn "neon_vsli_n<mode>"
+  [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+	(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "0")
+        	       (match_operand:VDQIX 2 "s_register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_VSLI))]
+  "TARGET_NEON"
+  "vsli.<V_sz_elem>\t%<V_reg>0, %<V_reg>2, %3"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_shift_1")
+                    (const_string "neon_shift_3")))]
+)
+
+(define_insn "neon_vtbl1v8qi"
+  [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+	(unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "w")
+		      (match_operand:V8QI 2 "s_register_operand" "w")]
+                     UNSPEC_VTBL))]
+  "TARGET_NEON"
+  "vtbl.8\t%P0, {%P1}, %P2"
+  [(set_attr "neon_type" "neon_bp_2cycle")]
+)
+
+(define_insn "neon_vtbl2v8qi"
+  [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+	(unspec:V8QI [(match_operand:TI 1 "s_register_operand" "w")
+		      (match_operand:V8QI 2 "s_register_operand" "w")]
+                     UNSPEC_VTBL))]
+  "TARGET_NEON"
+{
+  rtx ops[4];
+  int tabbase = REGNO (operands[1]);
+
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (V8QImode, tabbase);
+  ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
+  ops[3] = operands[2];
+  output_asm_insn ("vtbl.8\t%P0, {%P1, %P2}, %P3", ops);
+
+  return "";
+}
+  [(set_attr "neon_type" "neon_bp_2cycle")]
+)
+
+(define_insn "neon_vtbl3v8qi"
+  [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+	(unspec:V8QI [(match_operand:EI 1 "s_register_operand" "w")
+		      (match_operand:V8QI 2 "s_register_operand" "w")]
+                     UNSPEC_VTBL))]
+  "TARGET_NEON"
+{
+  rtx ops[5];
+  int tabbase = REGNO (operands[1]);
+
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (V8QImode, tabbase);
+  ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
+  ops[3] = gen_rtx_REG (V8QImode, tabbase + 4);
+  ops[4] = operands[2];
+  output_asm_insn ("vtbl.8\t%P0, {%P1, %P2, %P3}, %P4", ops);
+
+  return "";
+}
+  [(set_attr "neon_type" "neon_bp_3cycle")]
+)
+
+(define_insn "neon_vtbl4v8qi"
+  [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+	(unspec:V8QI [(match_operand:OI 1 "s_register_operand" "w")
+		      (match_operand:V8QI 2 "s_register_operand" "w")]
+                     UNSPEC_VTBL))]
+  "TARGET_NEON"
+{
+  rtx ops[6];
+  int tabbase = REGNO (operands[1]);
+
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (V8QImode, tabbase);
+  ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
+  ops[3] = gen_rtx_REG (V8QImode, tabbase + 4);
+  ops[4] = gen_rtx_REG (V8QImode, tabbase + 6);
+  ops[5] = operands[2];
+  output_asm_insn ("vtbl.8\t%P0, {%P1, %P2, %P3, %P4}, %P5", ops);
+
+  return "";
+}
+  [(set_attr "neon_type" "neon_bp_3cycle")]
+)
+
+(define_insn "neon_vtbx1v8qi"
+  [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+	(unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
+		      (match_operand:V8QI 2 "s_register_operand" "w")
+		      (match_operand:V8QI 3 "s_register_operand" "w")]
+                     UNSPEC_VTBX))]
+  "TARGET_NEON"
+  "vtbx.8\t%P0, {%P2}, %P3"
+  [(set_attr "neon_type" "neon_bp_2cycle")]
+)
+
+(define_insn "neon_vtbx2v8qi"
+  [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+	(unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
+		      (match_operand:TI 2 "s_register_operand" "w")
+		      (match_operand:V8QI 3 "s_register_operand" "w")]
+                     UNSPEC_VTBX))]
+  "TARGET_NEON"
+{
+  rtx ops[4];
+  int tabbase = REGNO (operands[2]);
+
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (V8QImode, tabbase);
+  ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
+  ops[3] = operands[3];
+  output_asm_insn ("vtbx.8\t%P0, {%P1, %P2}, %P3", ops);
+
+  return "";
+}
+  [(set_attr "neon_type" "neon_bp_2cycle")]
+)
+
+(define_insn "neon_vtbx3v8qi"
+  [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+	(unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
+		      (match_operand:EI 2 "s_register_operand" "w")
+		      (match_operand:V8QI 3 "s_register_operand" "w")]
+                     UNSPEC_VTBX))]
+  "TARGET_NEON"
+{
+  rtx ops[5];
+  int tabbase = REGNO (operands[2]);
+
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (V8QImode, tabbase);
+  ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
+  ops[3] = gen_rtx_REG (V8QImode, tabbase + 4);
+  ops[4] = operands[3];
+  output_asm_insn ("vtbx.8\t%P0, {%P1, %P2, %P3}, %P4", ops);
+
+  return "";
+}
+  [(set_attr "neon_type" "neon_bp_3cycle")]
+)
+
+(define_insn "neon_vtbx4v8qi"
+  [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+	(unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
+		      (match_operand:OI 2 "s_register_operand" "w")
+		      (match_operand:V8QI 3 "s_register_operand" "w")]
+                     UNSPEC_VTBX))]
+  "TARGET_NEON"
+{
+  rtx ops[6];
+  int tabbase = REGNO (operands[2]);
+
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (V8QImode, tabbase);
+  ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
+  ops[3] = gen_rtx_REG (V8QImode, tabbase + 4);
+  ops[4] = gen_rtx_REG (V8QImode, tabbase + 6);
+  ops[5] = operands[3];
+  output_asm_insn ("vtbx.8\t%P0, {%P1, %P2, %P3, %P4}, %P5", ops);
+
+  return "";
+}
+  [(set_attr "neon_type" "neon_bp_3cycle")]
+)
+
+(define_insn "neon_vtrn<mode>_internal"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+	(unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")]
+		     UNSPEC_VTRN1))
+   (set (match_operand:VDQW 2 "s_register_operand" "=w")
+        (unspec:VDQW [(match_operand:VDQW 3 "s_register_operand" "2")]
+		     UNSPEC_VTRN2))]
+  "TARGET_NEON"
+  "vtrn.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_bp_simple")
+                    (const_string "neon_bp_3cycle")))]
+)
+
+(define_expand "neon_vtrn<mode>"
+  [(match_operand:SI 0 "s_register_operand" "r")
+   (match_operand:VDQW 1 "s_register_operand" "w")
+   (match_operand:VDQW 2 "s_register_operand" "w")]
+  "TARGET_NEON"
+{
+  neon_emit_pair_result_insn (<MODE>mode, gen_neon_vtrn<mode>_internal,
+			      operands[0], operands[1], operands[2]);
+  DONE;
+})
+
+(define_insn "neon_vzip<mode>_internal"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+	(unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")]
+		     UNSPEC_VZIP1))
+   (set (match_operand:VDQW 2 "s_register_operand" "=w")
+        (unspec:VDQW [(match_operand:VDQW 3 "s_register_operand" "2")]
+		     UNSPEC_VZIP2))]
+  "TARGET_NEON"
+  "vzip.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_bp_simple")
+                    (const_string "neon_bp_3cycle")))]
+)
+
+(define_expand "neon_vzip<mode>"
+  [(match_operand:SI 0 "s_register_operand" "r")
+   (match_operand:VDQW 1 "s_register_operand" "w")
+   (match_operand:VDQW 2 "s_register_operand" "w")]
+  "TARGET_NEON"
+{
+  neon_emit_pair_result_insn (<MODE>mode, gen_neon_vzip<mode>_internal,
+			      operands[0], operands[1], operands[2]);
+  DONE;
+})
+
+(define_insn "neon_vuzp<mode>_internal"
+  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+	(unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")]
+                     UNSPEC_VUZP1))
+   (set (match_operand:VDQW 2 "s_register_operand" "=w")
+        (unspec:VDQW [(match_operand:VDQW 3 "s_register_operand" "2")]
+		     UNSPEC_VUZP2))]
+  "TARGET_NEON"
+  "vuzp.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
+  [(set (attr "neon_type")
+      (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                    (const_string "neon_bp_simple")
+                    (const_string "neon_bp_3cycle")))]
+)
+
+(define_expand "neon_vuzp<mode>"
+  [(match_operand:SI 0 "s_register_operand" "r")
+   (match_operand:VDQW 1 "s_register_operand" "w")
+   (match_operand:VDQW 2 "s_register_operand" "w")]
+  "TARGET_NEON"
+{
+  neon_emit_pair_result_insn (<MODE>mode, gen_neon_vuzp<mode>_internal,
+			      operands[0], operands[1], operands[2]);
+  DONE;
+})
+
+(define_expand "neon_vreinterpretv8qi<mode>"
+  [(match_operand:V8QI 0 "s_register_operand" "")
+   (match_operand:VDX 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_reinterpret (operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "neon_vreinterpretv4hi<mode>"
+  [(match_operand:V4HI 0 "s_register_operand" "")
+   (match_operand:VDX 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_reinterpret (operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "neon_vreinterpretv2si<mode>"
+  [(match_operand:V2SI 0 "s_register_operand" "")
+   (match_operand:VDX 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_reinterpret (operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "neon_vreinterpretv2sf<mode>"
+  [(match_operand:V2SF 0 "s_register_operand" "")
+   (match_operand:VDX 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_reinterpret (operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "neon_vreinterpretdi<mode>"
+  [(match_operand:DI 0 "s_register_operand" "")
+   (match_operand:VDX 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_reinterpret (operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "neon_vreinterpretv16qi<mode>"
+  [(match_operand:V16QI 0 "s_register_operand" "")
+   (match_operand:VQX 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_reinterpret (operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "neon_vreinterpretv8hi<mode>"
+  [(match_operand:V8HI 0 "s_register_operand" "")
+   (match_operand:VQX 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_reinterpret (operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "neon_vreinterpretv4si<mode>"
+  [(match_operand:V4SI 0 "s_register_operand" "")
+   (match_operand:VQX 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_reinterpret (operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "neon_vreinterpretv4sf<mode>"
+  [(match_operand:V4SF 0 "s_register_operand" "")
+   (match_operand:VQX 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_reinterpret (operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "neon_vreinterpretv2di<mode>"
+  [(match_operand:V2DI 0 "s_register_operand" "")
+   (match_operand:VQX 1 "s_register_operand" "")]
+  "TARGET_NEON"
+{
+  neon_reinterpret (operands[0], operands[1]);
+  DONE;
+})
+
+(define_insn "neon_vld1<mode>"
+  [(set (match_operand:VDQX 0 "s_register_operand" "=w")
+        (unspec:VDQX [(mem:VDQX (match_operand:SI 1 "s_register_operand" "r"))]
+                    UNSPEC_VLD1))]
+  "TARGET_NEON"
+  "vld1.<V_sz_elem>\t%h0, [%1]"
+  [(set_attr "neon_type" "neon_vld1_1_2_regs")]
+)
+
+(define_insn "neon_vld1_lane<mode>"
+  [(set (match_operand:VDX 0 "s_register_operand" "=w")
+        (unspec:VDX [(mem:<V_elem> (match_operand:SI 1 "s_register_operand" "r"))
+                     (match_operand:VDX 2 "s_register_operand" "0")
+                     (match_operand:SI 3 "immediate_operand" "i")]
+                    UNSPEC_VLD1_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[3]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  if (max == 1)
+    return "vld1.<V_sz_elem>\t%P0, [%1]";
+  else
+    return "vld1.<V_sz_elem>\t{%P0[%c3]}, [%1]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (eq (const_string "<V_mode_nunits>") (const_int 2))
+                    (const_string "neon_vld1_1_2_regs")
+                    (const_string "neon_vld1_vld2_lane")))]
+)
+
+(define_insn "neon_vld1_lane<mode>"
+  [(set (match_operand:VQX 0 "s_register_operand" "=w")
+        (unspec:VQX [(mem:<V_elem> (match_operand:SI 1 "s_register_operand" "r"))
+                     (match_operand:VQX 2 "s_register_operand" "0")
+                     (match_operand:SI 3 "immediate_operand" "i")]
+                    UNSPEC_VLD1_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[3]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[0]);
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  else if (lane >= max / 2)
+    {
+      lane -= max / 2;
+      regno += 2;
+      operands[3] = GEN_INT (lane);
+    }
+  operands[0] = gen_rtx_REG (<V_HALF>mode, regno);
+  if (max == 2)
+    return "vld1.<V_sz_elem>\t%P0, [%1]";
+  else
+    return "vld1.<V_sz_elem>\t{%P0[%c3]}, [%1]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (eq (const_string "<V_mode_nunits>") (const_int 2))
+                    (const_string "neon_vld1_1_2_regs")
+                    (const_string "neon_vld1_vld2_lane")))]
+)
+
+(define_insn "neon_vld1_dup<mode>"
+  [(set (match_operand:VDX 0 "s_register_operand" "=w")
+        (unspec:VDX [(mem:<V_elem> (match_operand:SI 1 "s_register_operand" "r"))]
+                    UNSPEC_VLD1_DUP))]
+  "TARGET_NEON"
+{
+  if (GET_MODE_NUNITS (<MODE>mode) > 1)
+    return "vld1.<V_sz_elem>\t{%P0[]}, [%1]";
+  else
+    return "vld1.<V_sz_elem>\t%h0, [%1]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (gt (const_string "<V_mode_nunits>") (const_string "1"))
+                    (const_string "neon_vld2_2_regs_vld1_vld2_all_lanes")
+                    (const_string "neon_vld1_1_2_regs")))]
+)
+
+(define_insn "neon_vld1_dup<mode>"
+  [(set (match_operand:VQX 0 "s_register_operand" "=w")
+        (unspec:VQX [(mem:<V_elem> (match_operand:SI 1 "s_register_operand" "r"))]
+                    UNSPEC_VLD1_DUP))]
+  "TARGET_NEON"
+{
+  if (GET_MODE_NUNITS (<MODE>mode) > 2)
+    return "vld1.<V_sz_elem>\t{%e0[], %f0[]}, [%1]";
+  else
+    return "vld1.<V_sz_elem>\t%h0, [%1]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (gt (const_string "<V_mode_nunits>") (const_string "1"))
+                    (const_string "neon_vld2_2_regs_vld1_vld2_all_lanes")
+                    (const_string "neon_vld1_1_2_regs")))]
+)
+
+(define_insn "neon_vst1<mode>"
+  [(set (mem:VDQX (match_operand:SI 0 "s_register_operand" "r"))
+	(unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w")]
+		     UNSPEC_VST1))]
+  "TARGET_NEON"
+  "vst1.<V_sz_elem>\t%h1, [%0]"
+  [(set_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")])
+
+(define_insn "neon_vst1_lane<mode>"
+  [(set (mem:<V_elem> (match_operand:SI 0 "s_register_operand" "r"))
+	(vec_select:<V_elem>
+	  (match_operand:VDX 1 "s_register_operand" "w")
+	  (parallel [(match_operand:SI 2 "neon_lane_number" "i")])))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[2]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  if (max == 1)
+    return "vst1.<V_sz_elem>\t{%P1}, [%0]";
+  else
+    return "vst1.<V_sz_elem>\t{%P1[%c2]}, [%0]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (eq (const_string "<V_mode_nunits>") (const_int 1))
+                    (const_string "neon_vst1_1_2_regs_vst2_2_regs")
+                    (const_string "neon_vst1_vst2_lane")))])
+
+(define_insn "neon_vst1_lane<mode>"
+  [(set (mem:<V_elem> (match_operand:SI 0 "s_register_operand" "r"))
+        (vec_select:<V_elem>
+           (match_operand:VQX 1 "s_register_operand" "w")
+           (parallel [(match_operand:SI 2 "neon_lane_number" "i")])))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[2]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[1]);
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  else if (lane >= max / 2)
+    {
+      lane -= max / 2;
+      regno += 2;
+      operands[2] = GEN_INT (lane);
+    }
+  operands[1] = gen_rtx_REG (<V_HALF>mode, regno);
+  if (max == 2)
+    return "vst1.<V_sz_elem>\t{%P1}, [%0]";
+  else
+    return "vst1.<V_sz_elem>\t{%P1[%c2]}, [%0]";
+}
+  [(set_attr "neon_type" "neon_vst1_vst2_lane")]
+)
+
+(define_insn "neon_vld2<mode>"
+  [(set (match_operand:TI 0 "s_register_operand" "=w")
+        (unspec:TI [(mem:TI (match_operand:SI 1 "s_register_operand" "r"))
+                    (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD2))]
+  "TARGET_NEON"
+{
+  if (<V_sz_elem> == 64)
+    return "vld1.64\t%h0, [%1]";
+  else
+    return "vld2.<V_sz_elem>\t%h0, [%1]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
+                    (const_string "neon_vld1_1_2_regs")
+                    (const_string "neon_vld2_2_regs_vld1_vld2_all_lanes")))]
+)
+
+(define_insn "neon_vld2<mode>"
+  [(set (match_operand:OI 0 "s_register_operand" "=w")
+        (unspec:OI [(mem:OI (match_operand:SI 1 "s_register_operand" "r"))
+                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD2))]
+  "TARGET_NEON"
+  "vld2.<V_sz_elem>\t%h0, [%1]"
+  [(set_attr "neon_type" "neon_vld2_2_regs_vld1_vld2_all_lanes")])
+
+(define_insn "neon_vld2_lane<mode>"
+  [(set (match_operand:TI 0 "s_register_operand" "=w")
+        (unspec:TI [(mem:<V_two_elem> (match_operand:SI 1 "s_register_operand" "r"))
+                    (match_operand:TI 2 "s_register_operand" "0")
+                    (match_operand:SI 3 "immediate_operand" "i")
+                    (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD2_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[3]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[0]);
+  rtx ops[4];
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  ops[0] = gen_rtx_REG (DImode, regno);
+  ops[1] = gen_rtx_REG (DImode, regno + 2);
+  ops[2] = operands[1];
+  ops[3] = operands[3];
+  output_asm_insn ("vld2.<V_sz_elem>\t{%P0[%c3], %P1[%c3]}, [%2]", ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vld1_vld2_lane")]
+)
+
+(define_insn "neon_vld2_lane<mode>"
+  [(set (match_operand:OI 0 "s_register_operand" "=w")
+        (unspec:OI [(mem:<V_two_elem> (match_operand:SI 1 "s_register_operand" "r"))
+                    (match_operand:OI 2 "s_register_operand" "0")
+                    (match_operand:SI 3 "immediate_operand" "i")
+                    (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD2_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[3]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[0]);
+  rtx ops[4];
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  else if (lane >= max / 2)
+    {
+      lane -= max / 2;
+      regno += 2;
+    }
+  ops[0] = gen_rtx_REG (DImode, regno);
+  ops[1] = gen_rtx_REG (DImode, regno + 4);
+  ops[2] = operands[1];
+  ops[3] = GEN_INT (lane);
+  output_asm_insn ("vld2.<V_sz_elem>\t{%P0[%c3], %P1[%c3]}, [%2]", ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vld1_vld2_lane")]
+)
+
+(define_insn "neon_vld2_dup<mode>"
+  [(set (match_operand:TI 0 "s_register_operand" "=w")
+        (unspec:TI [(mem:<V_two_elem> (match_operand:SI 1 "s_register_operand" "r"))
+                    (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD2_DUP))]
+  "TARGET_NEON"
+{
+  if (GET_MODE_NUNITS (<MODE>mode) > 1)
+    return "vld2.<V_sz_elem>\t{%e0[], %f0[]}, [%1]";
+  else
+    return "vld1.<V_sz_elem>\t%h0, [%1]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (gt (const_string "<V_mode_nunits>") (const_string "1"))
+                    (const_string "neon_vld2_2_regs_vld1_vld2_all_lanes")
+                    (const_string "neon_vld1_1_2_regs")))]
+)
+
+(define_insn "neon_vst2<mode>"
+  [(set (mem:TI (match_operand:SI 0 "s_register_operand" "r"))
+        (unspec:TI [(match_operand:TI 1 "s_register_operand" "w")
+                    (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VST2))]
+  "TARGET_NEON"
+{
+  if (<V_sz_elem> == 64)
+    return "vst1.64\t%h1, [%0]";
+  else
+    return "vst2.<V_sz_elem>\t%h1, [%0]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
+                    (const_string "neon_vst1_1_2_regs_vst2_2_regs")
+                    (const_string "neon_vst1_1_2_regs_vst2_2_regs")))]
+)
+
+(define_insn "neon_vst2<mode>"
+  [(set (mem:OI (match_operand:SI 0 "s_register_operand" "r"))
+	(unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
+		    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+		   UNSPEC_VST2))]
+  "TARGET_NEON"
+  "vst2.<V_sz_elem>\t%h1, [%0]"
+  [(set_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")]
+)
+
+(define_insn "neon_vst2_lane<mode>"
+  [(set (mem:<V_two_elem> (match_operand:SI 0 "s_register_operand" "r"))
+	(unspec:<V_two_elem>
+	  [(match_operand:TI 1 "s_register_operand" "w")
+	   (match_operand:SI 2 "immediate_operand" "i")
+	   (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+	  UNSPEC_VST2_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[2]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[1]);
+  rtx ops[4];
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (DImode, regno);
+  ops[2] = gen_rtx_REG (DImode, regno + 2);
+  ops[3] = operands[2];
+  output_asm_insn ("vst2.<V_sz_elem>\t{%P1[%c3], %P2[%c3]}, [%0]", ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vst1_vst2_lane")]
+)
+
+(define_insn "neon_vst2_lane<mode>"
+  [(set (mem:<V_two_elem> (match_operand:SI 0 "s_register_operand" "r"))
+        (unspec:<V_two_elem>
+           [(match_operand:OI 1 "s_register_operand" "w")
+            (match_operand:SI 2 "immediate_operand" "i")
+            (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+           UNSPEC_VST2_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[2]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[1]);
+  rtx ops[4];
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  else if (lane >= max / 2)
+    {
+      lane -= max / 2;
+      regno += 2;
+    }
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (DImode, regno);
+  ops[2] = gen_rtx_REG (DImode, regno + 4);
+  ops[3] = GEN_INT (lane);
+  output_asm_insn ("vst2.<V_sz_elem>\t{%P1[%c3], %P2[%c3]}, [%0]", ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vst1_vst2_lane")]
+)
+
+(define_insn "neon_vld3<mode>"
+  [(set (match_operand:EI 0 "s_register_operand" "=w")
+        (unspec:EI [(mem:EI (match_operand:SI 1 "s_register_operand" "r"))
+                    (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD3))]
+  "TARGET_NEON"
+{
+  if (<V_sz_elem> == 64)
+    return "vld1.64\t%h0, [%1]";
+  else
+    return "vld3.<V_sz_elem>\t%h0, [%1]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
+                    (const_string "neon_vld1_1_2_regs")
+                    (const_string "neon_vld3_vld4")))]
+)
+
+(define_expand "neon_vld3<mode>"
+  [(match_operand:CI 0 "s_register_operand" "=w")
+   (match_operand:SI 1 "s_register_operand" "+r")
+   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_NEON"
+{
+  emit_insn (gen_neon_vld3qa<mode> (operands[0], operands[0],
+                                    operands[1], operands[1]));
+  emit_insn (gen_neon_vld3qb<mode> (operands[0], operands[0],
+                                    operands[1], operands[1]));
+  DONE;
+})
+
+(define_insn "neon_vld3qa<mode>"
+  [(set (match_operand:CI 0 "s_register_operand" "=w")
+        (unspec:CI [(mem:CI (match_operand:SI 3 "s_register_operand" "2"))
+                    (match_operand:CI 1 "s_register_operand" "0")
+                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD3A))
+   (set (match_operand:SI 2 "s_register_operand" "=r")
+        (plus:SI (match_dup 3)
+		 (const_int 24)))]
+  "TARGET_NEON"
+{
+  int regno = REGNO (operands[0]);
+  rtx ops[4];
+  ops[0] = gen_rtx_REG (DImode, regno);
+  ops[1] = gen_rtx_REG (DImode, regno + 4);
+  ops[2] = gen_rtx_REG (DImode, regno + 8);
+  ops[3] = operands[2];
+  output_asm_insn ("vld3.<V_sz_elem>\t{%P0, %P1, %P2}, [%3]!", ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vld3_vld4")]
+)
+
+(define_insn "neon_vld3qb<mode>"
+  [(set (match_operand:CI 0 "s_register_operand" "=w")
+        (unspec:CI [(mem:CI (match_operand:SI 3 "s_register_operand" "2"))
+                    (match_operand:CI 1 "s_register_operand" "0")
+                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD3B))
+   (set (match_operand:SI 2 "s_register_operand" "=r")
+        (plus:SI (match_dup 3)
+		 (const_int 24)))]
+  "TARGET_NEON"
+{
+  int regno = REGNO (operands[0]);
+  rtx ops[4];
+  ops[0] = gen_rtx_REG (DImode, regno + 2);
+  ops[1] = gen_rtx_REG (DImode, regno + 6);
+  ops[2] = gen_rtx_REG (DImode, regno + 10);
+  ops[3] = operands[2];
+  output_asm_insn ("vld3.<V_sz_elem>\t{%P0, %P1, %P2}, [%3]!", ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vld3_vld4")]
+)
+
+(define_insn "neon_vld3_lane<mode>"
+  [(set (match_operand:EI 0 "s_register_operand" "=w")
+        (unspec:EI [(mem:<V_three_elem> (match_operand:SI 1 "s_register_operand" "r"))
+                    (match_operand:EI 2 "s_register_operand" "0")
+                    (match_operand:SI 3 "immediate_operand" "i")
+                    (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD3_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[3]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[0]);
+  rtx ops[5];
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  ops[0] = gen_rtx_REG (DImode, regno);
+  ops[1] = gen_rtx_REG (DImode, regno + 2);
+  ops[2] = gen_rtx_REG (DImode, regno + 4);
+  ops[3] = operands[1];
+  ops[4] = operands[3];
+  output_asm_insn ("vld3.<V_sz_elem>\t{%P0[%c4], %P1[%c4], %P2[%c4]}, [%3]",
+                   ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vld3_vld4_lane")]
+)
+
+(define_insn "neon_vld3_lane<mode>"
+  [(set (match_operand:CI 0 "s_register_operand" "=w")
+        (unspec:CI [(mem:<V_three_elem> (match_operand:SI 1 "s_register_operand" "r"))
+                    (match_operand:CI 2 "s_register_operand" "0")
+                    (match_operand:SI 3 "immediate_operand" "i")
+                    (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD3_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[3]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[0]);
+  rtx ops[5];
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  else if (lane >= max / 2)
+    {
+      lane -= max / 2;
+      regno += 2;
+    }
+  ops[0] = gen_rtx_REG (DImode, regno);
+  ops[1] = gen_rtx_REG (DImode, regno + 4);
+  ops[2] = gen_rtx_REG (DImode, regno + 8);
+  ops[3] = operands[1];
+  ops[4] = GEN_INT (lane);
+  output_asm_insn ("vld3.<V_sz_elem>\t{%P0[%c4], %P1[%c4], %P2[%c4]}, [%3]",
+                   ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vld3_vld4_lane")]
+)
+
+(define_insn "neon_vld3_dup<mode>"
+  [(set (match_operand:EI 0 "s_register_operand" "=w")
+        (unspec:EI [(mem:<V_three_elem> (match_operand:SI 1 "s_register_operand" "r"))
+                    (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD3_DUP))]
+  "TARGET_NEON"
+{
+  if (GET_MODE_NUNITS (<MODE>mode) > 1)
+    {
+      int regno = REGNO (operands[0]);
+      rtx ops[4];
+      ops[0] = gen_rtx_REG (DImode, regno);
+      ops[1] = gen_rtx_REG (DImode, regno + 2);
+      ops[2] = gen_rtx_REG (DImode, regno + 4);
+      ops[3] = operands[1];
+      output_asm_insn ("vld3.<V_sz_elem>\t{%P0[], %P1[], %P2[]}, [%3]", ops);
+      return "";
+    }
+  else
+    return "vld1.<V_sz_elem>\t%h0, [%1]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (gt (const_string "<V_mode_nunits>") (const_string "1"))
+                    (const_string "neon_vld3_vld4_all_lanes")
+                    (const_string "neon_vld1_1_2_regs")))])
+
+(define_insn "neon_vst3<mode>"
+  [(set (mem:EI (match_operand:SI 0 "s_register_operand" "r"))
+        (unspec:EI [(match_operand:EI 1 "s_register_operand" "w")
+                    (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VST3))]
+  "TARGET_NEON"
+{
+  if (<V_sz_elem> == 64)
+    return "vst1.64\t%h1, [%0]";
+  else
+    return "vst3.<V_sz_elem>\t%h1, [%0]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
+                    (const_string "neon_vst1_1_2_regs_vst2_2_regs")
+                    (const_string "neon_vst2_4_regs_vst3_vst4")))])
+
+(define_expand "neon_vst3<mode>"
+  [(match_operand:SI 0 "s_register_operand" "+r")
+   (match_operand:CI 1 "s_register_operand" "w")
+   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_NEON"
+{
+  emit_insn (gen_neon_vst3qa<mode> (operands[0], operands[0], operands[1]));
+  emit_insn (gen_neon_vst3qb<mode> (operands[0], operands[0], operands[1]));
+  DONE;
+})
+
+(define_insn "neon_vst3qa<mode>"
+  [(set (mem:EI (match_operand:SI 1 "s_register_operand" "0"))
+        (unspec:EI [(match_operand:CI 2 "s_register_operand" "w")
+                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VST3A))
+   (set (match_operand:SI 0 "s_register_operand" "=r")
+        (plus:SI (match_dup 1)
+		 (const_int 24)))]
+  "TARGET_NEON"
+{
+  int regno = REGNO (operands[2]);
+  rtx ops[4];
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (DImode, regno);
+  ops[2] = gen_rtx_REG (DImode, regno + 4);
+  ops[3] = gen_rtx_REG (DImode, regno + 8);
+  output_asm_insn ("vst3.<V_sz_elem>\t{%P1, %P2, %P3}, [%0]!", ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")]
+)
+
+(define_insn "neon_vst3qb<mode>"
+  [(set (mem:EI (match_operand:SI 1 "s_register_operand" "0"))
+        (unspec:EI [(match_operand:CI 2 "s_register_operand" "w")
+                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VST3B))
+   (set (match_operand:SI 0 "s_register_operand" "=r")
+        (plus:SI (match_dup 1)
+		 (const_int 24)))]
+  "TARGET_NEON"
+{
+  int regno = REGNO (operands[2]);
+  rtx ops[4];
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (DImode, regno + 2);
+  ops[2] = gen_rtx_REG (DImode, regno + 6);
+  ops[3] = gen_rtx_REG (DImode, regno + 10);
+  output_asm_insn ("vst3.<V_sz_elem>\t{%P1, %P2, %P3}, [%0]!", ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")]
+)
+
+(define_insn "neon_vst3_lane<mode>"
+  [(set (mem:<V_three_elem> (match_operand:SI 0 "s_register_operand" "r"))
+        (unspec:<V_three_elem>
+           [(match_operand:EI 1 "s_register_operand" "w")
+            (match_operand:SI 2 "immediate_operand" "i")
+            (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+           UNSPEC_VST3_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[2]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[1]);
+  rtx ops[5];
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (DImode, regno);
+  ops[2] = gen_rtx_REG (DImode, regno + 2);
+  ops[3] = gen_rtx_REG (DImode, regno + 4);
+  ops[4] = operands[2];
+  output_asm_insn ("vst3.<V_sz_elem>\t{%P1[%c4], %P2[%c4], %P3[%c4]}, [%0]",
+                   ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vst3_vst4_lane")]
+)
+
+(define_insn "neon_vst3_lane<mode>"
+  [(set (mem:<V_three_elem> (match_operand:SI 0 "s_register_operand" "r"))
+        (unspec:<V_three_elem>
+           [(match_operand:CI 1 "s_register_operand" "w")
+            (match_operand:SI 2 "immediate_operand" "i")
+            (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+           UNSPEC_VST3_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[2]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[1]);
+  rtx ops[5];
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  else if (lane >= max / 2)
+    {
+      lane -= max / 2;
+      regno += 2;
+    }
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (DImode, regno);
+  ops[2] = gen_rtx_REG (DImode, regno + 4);
+  ops[3] = gen_rtx_REG (DImode, regno + 8);
+  ops[4] = GEN_INT (lane);
+  output_asm_insn ("vst3.<V_sz_elem>\t{%P1[%c4], %P2[%c4], %P3[%c4]}, [%0]",
+                   ops);
+  return "";
+}
+[(set_attr "neon_type" "neon_vst3_vst4_lane")])
+
+(define_insn "neon_vld4<mode>"
+  [(set (match_operand:OI 0 "s_register_operand" "=w")
+        (unspec:OI [(mem:OI (match_operand:SI 1 "s_register_operand" "r"))
+                    (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD4))]
+  "TARGET_NEON"
+{
+  if (<V_sz_elem> == 64)
+    return "vld1.64\t%h0, [%1]";
+  else
+    return "vld4.<V_sz_elem>\t%h0, [%1]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
+                    (const_string "neon_vld1_1_2_regs")
+                    (const_string "neon_vld3_vld4")))]
+)
+
+(define_expand "neon_vld4<mode>"
+  [(match_operand:XI 0 "s_register_operand" "=w")
+   (match_operand:SI 1 "s_register_operand" "+r")
+   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_NEON"
+{
+  emit_insn (gen_neon_vld4qa<mode> (operands[0], operands[0],
+                                    operands[1], operands[1]));
+  emit_insn (gen_neon_vld4qb<mode> (operands[0], operands[0],
+                                    operands[1], operands[1]));
+  DONE;
+})
+
+(define_insn "neon_vld4qa<mode>"
+  [(set (match_operand:XI 0 "s_register_operand" "=w")
+        (unspec:XI [(mem:XI (match_operand:SI 3 "s_register_operand" "2"))
+                    (match_operand:XI 1 "s_register_operand" "0")
+                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD4A))
+   (set (match_operand:SI 2 "s_register_operand" "=r")
+        (plus:SI (match_dup 3)
+		 (const_int 32)))]
+  "TARGET_NEON"
+{
+  int regno = REGNO (operands[0]);
+  rtx ops[5];
+  ops[0] = gen_rtx_REG (DImode, regno);
+  ops[1] = gen_rtx_REG (DImode, regno + 4);
+  ops[2] = gen_rtx_REG (DImode, regno + 8);
+  ops[3] = gen_rtx_REG (DImode, regno + 12);
+  ops[4] = operands[2];
+  output_asm_insn ("vld4.<V_sz_elem>\t{%P0, %P1, %P2, %P3}, [%4]!", ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vld3_vld4")]
+)
+
+(define_insn "neon_vld4qb<mode>"
+  [(set (match_operand:XI 0 "s_register_operand" "=w")
+        (unspec:XI [(mem:XI (match_operand:SI 3 "s_register_operand" "2"))
+                    (match_operand:XI 1 "s_register_operand" "0")
+                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD4B))
+   (set (match_operand:SI 2 "s_register_operand" "=r")
+        (plus:SI (match_dup 3)
+		 (const_int 32)))]
+  "TARGET_NEON"
+{
+  int regno = REGNO (operands[0]);
+  rtx ops[5];
+  ops[0] = gen_rtx_REG (DImode, regno + 2);
+  ops[1] = gen_rtx_REG (DImode, regno + 6);
+  ops[2] = gen_rtx_REG (DImode, regno + 10);
+  ops[3] = gen_rtx_REG (DImode, regno + 14);
+  ops[4] = operands[2];
+  output_asm_insn ("vld4.<V_sz_elem>\t{%P0, %P1, %P2, %P3}, [%4]!", ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vld3_vld4")]
+)
+
+(define_insn "neon_vld4_lane<mode>"
+  [(set (match_operand:OI 0 "s_register_operand" "=w")
+        (unspec:OI [(mem:<V_four_elem> (match_operand:SI 1 "s_register_operand" "r"))
+                    (match_operand:OI 2 "s_register_operand" "0")
+                    (match_operand:SI 3 "immediate_operand" "i")
+                    (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD4_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[3]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[0]);
+  rtx ops[6];
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  ops[0] = gen_rtx_REG (DImode, regno);
+  ops[1] = gen_rtx_REG (DImode, regno + 2);
+  ops[2] = gen_rtx_REG (DImode, regno + 4);
+  ops[3] = gen_rtx_REG (DImode, regno + 6);
+  ops[4] = operands[1];
+  ops[5] = operands[3];
+  output_asm_insn ("vld4.<V_sz_elem>\t{%P0[%c5], %P1[%c5], %P2[%c5], %P3[%c5]}, [%4]",
+                   ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vld3_vld4_lane")]
+)
+
+(define_insn "neon_vld4_lane<mode>"
+  [(set (match_operand:XI 0 "s_register_operand" "=w")
+        (unspec:XI [(mem:<V_four_elem> (match_operand:SI 1 "s_register_operand" "r"))
+                    (match_operand:XI 2 "s_register_operand" "0")
+                    (match_operand:SI 3 "immediate_operand" "i")
+                    (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD4_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[3]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[0]);
+  rtx ops[6];
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  else if (lane >= max / 2)
+    {
+      lane -= max / 2;
+      regno += 2;
+    }
+  ops[0] = gen_rtx_REG (DImode, regno);
+  ops[1] = gen_rtx_REG (DImode, regno + 4);
+  ops[2] = gen_rtx_REG (DImode, regno + 8);
+  ops[3] = gen_rtx_REG (DImode, regno + 12);
+  ops[4] = operands[1];
+  ops[5] = GEN_INT (lane);
+  output_asm_insn ("vld4.<V_sz_elem>\t{%P0[%c5], %P1[%c5], %P2[%c5], %P3[%c5]}, [%4]",
+                   ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vld3_vld4_lane")]
+)
+
+(define_insn "neon_vld4_dup<mode>"
+  [(set (match_operand:OI 0 "s_register_operand" "=w")
+        (unspec:OI [(mem:<V_four_elem> (match_operand:SI 1 "s_register_operand" "r"))
+                    (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VLD4_DUP))]
+  "TARGET_NEON"
+{
+  if (GET_MODE_NUNITS (<MODE>mode) > 1)
+    {
+      int regno = REGNO (operands[0]);
+      rtx ops[5];
+      ops[0] = gen_rtx_REG (DImode, regno);
+      ops[1] = gen_rtx_REG (DImode, regno + 2);
+      ops[2] = gen_rtx_REG (DImode, regno + 4);
+      ops[3] = gen_rtx_REG (DImode, regno + 6);
+      ops[4] = operands[1];
+      output_asm_insn ("vld4.<V_sz_elem>\t{%P0[], %P1[], %P2[], %P3[]}, [%4]",
+                       ops);
+      return "";
+    }
+  else
+    return "vld1.<V_sz_elem>\t%h0, [%1]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (gt (const_string "<V_mode_nunits>") (const_string "1"))
+                    (const_string "neon_vld3_vld4_all_lanes")
+                    (const_string "neon_vld1_1_2_regs")))]
+)
+
+(define_insn "neon_vst4<mode>"
+  [(set (mem:OI (match_operand:SI 0 "s_register_operand" "r"))
+        (unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
+                    (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VST4))]
+  "TARGET_NEON"
+{
+  if (<V_sz_elem> == 64)
+    return "vst1.64\t%h1, [%0]";
+  else
+    return "vst4.<V_sz_elem>\t%h1, [%0]";
+}
+  [(set (attr "neon_type")
+      (if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
+                    (const_string "neon_vst1_1_2_regs_vst2_2_regs")
+                    (const_string "neon_vst2_4_regs_vst3_vst4")))]
+)
+
+(define_expand "neon_vst4<mode>"
+  [(match_operand:SI 0 "s_register_operand" "+r")
+   (match_operand:XI 1 "s_register_operand" "w")
+   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_NEON"
+{
+  emit_insn (gen_neon_vst4qa<mode> (operands[0], operands[0], operands[1]));
+  emit_insn (gen_neon_vst4qb<mode> (operands[0], operands[0], operands[1]));
+  DONE;
+})
+
+(define_insn "neon_vst4qa<mode>"
+  [(set (mem:OI (match_operand:SI 1 "s_register_operand" "0"))
+        (unspec:OI [(match_operand:XI 2 "s_register_operand" "w")
+                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VST4A))
+   (set (match_operand:SI 0 "s_register_operand" "=r")
+        (plus:SI (match_dup 1)
+		 (const_int 32)))]
+  "TARGET_NEON"
+{
+  int regno = REGNO (operands[2]);
+  rtx ops[5];
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (DImode, regno);
+  ops[2] = gen_rtx_REG (DImode, regno + 4);
+  ops[3] = gen_rtx_REG (DImode, regno + 8);
+  ops[4] = gen_rtx_REG (DImode, regno + 12);
+  output_asm_insn ("vst4.<V_sz_elem>\t{%P1, %P2, %P3, %P4}, [%0]!", ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")]
+)
+
+(define_insn "neon_vst4qb<mode>"
+  [(set (mem:OI (match_operand:SI 1 "s_register_operand" "0"))
+        (unspec:OI [(match_operand:XI 2 "s_register_operand" "w")
+                    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VST4B))
+   (set (match_operand:SI 0 "s_register_operand" "=r")
+        (plus:SI (match_dup 1)
+		 (const_int 32)))]
+  "TARGET_NEON"
+{
+  int regno = REGNO (operands[2]);
+  rtx ops[5];
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (DImode, regno + 2);
+  ops[2] = gen_rtx_REG (DImode, regno + 6);
+  ops[3] = gen_rtx_REG (DImode, regno + 10);
+  ops[4] = gen_rtx_REG (DImode, regno + 14);
+  output_asm_insn ("vst4.<V_sz_elem>\t{%P1, %P2, %P3, %P4}, [%0]!", ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")]
+)
+
+(define_insn "neon_vst4_lane<mode>"
+  [(set (mem:<V_four_elem> (match_operand:SI 0 "s_register_operand" "r"))
+        (unspec:<V_four_elem>
+           [(match_operand:OI 1 "s_register_operand" "w")
+            (match_operand:SI 2 "immediate_operand" "i")
+            (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+           UNSPEC_VST4_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[2]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[1]);
+  rtx ops[6];
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (DImode, regno);
+  ops[2] = gen_rtx_REG (DImode, regno + 2);
+  ops[3] = gen_rtx_REG (DImode, regno + 4);
+  ops[4] = gen_rtx_REG (DImode, regno + 6);
+  ops[5] = operands[2];
+  output_asm_insn ("vst4.<V_sz_elem>\t{%P1[%c5], %P2[%c5], %P3[%c5], %P4[%c5]}, [%0]",
+                   ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vst3_vst4_lane")]
+)
+
+(define_insn "neon_vst4_lane<mode>"
+  [(set (mem:<V_four_elem> (match_operand:SI 0 "s_register_operand" "r"))
+        (unspec:<V_four_elem>
+           [(match_operand:XI 1 "s_register_operand" "w")
+            (match_operand:SI 2 "immediate_operand" "i")
+            (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+           UNSPEC_VST4_LANE))]
+  "TARGET_NEON"
+{
+  HOST_WIDE_INT lane = INTVAL (operands[2]);
+  HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+  int regno = REGNO (operands[1]);
+  rtx ops[6];
+  if (lane < 0 || lane >= max)
+    error ("lane out of range");
+  else if (lane >= max / 2)
+    {
+      lane -= max / 2;
+      regno += 2;
+    }
+  ops[0] = operands[0];
+  ops[1] = gen_rtx_REG (DImode, regno);
+  ops[2] = gen_rtx_REG (DImode, regno + 4);
+  ops[3] = gen_rtx_REG (DImode, regno + 8);
+  ops[4] = gen_rtx_REG (DImode, regno + 12);
+  ops[5] = GEN_INT (lane);
+  output_asm_insn ("vst4.<V_sz_elem>\t{%P1[%c5], %P2[%c5], %P3[%c5], %P4[%c5]}, [%0]",
+                   ops);
+  return "";
+}
+  [(set_attr "neon_type" "neon_vst3_vst4_lane")]
+)
+
+(define_expand "neon_vand<mode>"
+  [(match_operand:VDQX 0 "s_register_operand" "")
+   (match_operand:VDQX 1 "s_register_operand" "")
+   (match_operand:VDQX 2 "neon_inv_logic_op2" "")
+   (match_operand:SI 3 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  emit_insn (gen_and<mode>3<V_suf64> (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_expand "neon_vorr<mode>"
+  [(match_operand:VDQX 0 "s_register_operand" "")
+   (match_operand:VDQX 1 "s_register_operand" "")
+   (match_operand:VDQX 2 "neon_logic_op2" "")
+   (match_operand:SI 3 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  emit_insn (gen_ior<mode>3<V_suf64> (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_expand "neon_veor<mode>"
+  [(match_operand:VDQX 0 "s_register_operand" "")
+   (match_operand:VDQX 1 "s_register_operand" "")
+   (match_operand:VDQX 2 "s_register_operand" "")
+   (match_operand:SI 3 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  emit_insn (gen_xor<mode>3<V_suf64> (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_expand "neon_vbic<mode>"
+  [(match_operand:VDQX 0 "s_register_operand" "")
+   (match_operand:VDQX 1 "s_register_operand" "")
+   (match_operand:VDQX 2 "neon_logic_op2" "")
+   (match_operand:SI 3 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  emit_insn (gen_bic<mode>3_neon (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_expand "neon_vorn<mode>"
+  [(match_operand:VDQX 0 "s_register_operand" "")
+   (match_operand:VDQX 1 "s_register_operand" "")
+   (match_operand:VDQX 2 "neon_inv_logic_op2" "")
+   (match_operand:SI 3 "immediate_operand" "")]
+  "TARGET_NEON"
+{
+  emit_insn (gen_orn<mode>3_neon (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+;; APPLE LOCAL 6150859 begin use NEON instructions for SF math
+;; When possible, use the NEON instructions for single precision floating
+;; point operations. On NEON CPUs, the VFP instructions are not scoreboarded,
+;; so they perform poorly compared to the NEON ones. We use 32x2 vector
+;; instructions and just ignore the upper values.
+
+(define_insn "*addsf3_neon"
+  [(set (match_operand:SF	   0 "s_register_operand" "=t")
+	(plus:SF (match_operand:SF 1 "s_register_operand" "t")
+		 (match_operand:SF 2 "s_register_operand" "t")))]
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON"
+  "vadd.f32\\t%p0, %p1, %p2"
+  [(set_attr "neon_type" "neon_fp_vadd_ddd_vabs_dd")]
+)
+
+(define_insn "*subsf3_neon"
+  [(set (match_operand:SF	    0 "s_register_operand" "=t")
+	(minus:SF (match_operand:SF 1 "s_register_operand" "t")
+		  (match_operand:SF 2 "s_register_operand" "t")))]
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON"
+  "vsub.f32\\t%p0, %p1, %p2"
+  [(set_attr "neon_type" "neon_fp_vadd_ddd_vabs_dd")]
+)
+
+(define_insn "*mulsf3_neon"
+  [(set (match_operand:SF	   0 "s_register_operand" "+t")
+	(mult:SF (match_operand:SF 1 "s_register_operand" "t")
+		 (match_operand:SF 2 "s_register_operand" "t")))]
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON"
+  "vmul.f32\\t%p0, %p1, %p2"
+  [(set_attr "neon_type" "neon_fp_vadd_ddd_vabs_dd")]
+)
+
+;; APPLE LOCAL begin 6197406 disable vmla.f32 and vmls.f32
+;; The multiply-accumulate (vmla.f32) and multiply-subtract (vmls.f32)
+;; instructions cause a pipeline flush, so they are not useful in general.
+;; Disabling them for now.
+;; Multiply-accumulate insns
+;; 0 = 1 * 2 + 0
+; (define_insn "*mulsf3addsf_neon"
+;   [(set (match_operand:SF		    0 "s_register_operand" "=t")
+; 	(plus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "t")
+; 			  (match_operand:SF 3 "s_register_operand" "t"))
+; 		 (match_operand:SF	    1 "s_register_operand" "0")))]
+;   "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON"
+;   "vmla.f32\\t%p0, %p2, %p3"
+;   [(set_attr "neon_type" "neon_fp_vmla_ddd")]
+; )
+
+;; APPLE LOCAL begin 6251664 reversed operands for vmls.f32
+;; 0 = 0 - (1 * 2)
+; (define_insn "*mulsf3subsf_neon"
+;   [(set (match_operand:SF		     0 "s_register_operand" "=t")
+; 	(minus:SF (match_operand:SF	     1 "s_register_operand" "0")
+; 		  (mult:SF (match_operand:SF 2 "s_register_operand" "t")
+; 			   (match_operand:SF 3 "s_register_operand" "t"))))]
+;   "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON"
+;   "vmls.f32\\t%p0, %p2, %p3"
+;   [(set_attr "neon_type" "neon_fp_vmla_ddd")]
+; )
+;; APPLE LOCAL end 6251664 reversed operands for vmls.f32
+;; APPLE LOCAL end 6197406 disable vmla.f32 and vmls.f32
+;; APPLE LOCAL 6150859 end use NEON instructions for SF math
+

Added: llvm-gcc-4.2/trunk/gcc/config/arm/neon.ml
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/neon.ml?rev=76781&view=auto

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/neon.ml (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/neon.ml Wed Jul 22 15:36:27 2009
@@ -0,0 +1,1827 @@
+(* APPLE LOCAL file v7 support. Merge from Codesourcery *)
+(* Common code for ARM NEON header file, documentation and test case
+   generators.
+
+   Copyright (C) 2006 Free Software Foundation, Inc.
+   Contributed by CodeSourcery.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it under
+   the terms of the GNU General Public License as published by the Free
+   Software Foundation; either version 2, or (at your option) any later
+   version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or
+   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+   for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING.  If not, write to the Free
+   Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+   02110-1301, USA.  *)
+
+(* Shorthand types for vector elements.  *)
+type elts = S8 | S16 | S32 | S64 | F32 | U8 | U16 | U32 | U64 | P8 | P16
+          | I8 | I16 | I32 | I64 | B8 | B16 | B32 | B64 | Conv of elts * elts
+          | Cast of elts * elts | NoElts
+
+type eltclass = Signed | Unsigned | Float | Poly | Int | Bits
+	      | ConvClass of eltclass * eltclass | NoType
+
+(* These vector types correspond directly to C types.  *)
+type vectype = T_int8x8    | T_int8x16
+             | T_int16x4   | T_int16x8
+	     | T_int32x2   | T_int32x4
+	     | T_int64x1   | T_int64x2
+	     | T_uint8x8   | T_uint8x16
+	     | T_uint16x4  | T_uint16x8
+	     | T_uint32x2  | T_uint32x4
+	     | T_uint64x1  | T_uint64x2
+	     | T_float32x2 | T_float32x4
+	     | T_poly8x8   | T_poly8x16
+	     | T_poly16x4  | T_poly16x8
+	     | T_immediate of int * int
+             | T_int8      | T_int16
+             | T_int32     | T_int64
+             | T_uint8     | T_uint16
+             | T_uint32    | T_uint64
+             | T_poly8     | T_poly16
+             | T_float32   | T_arrayof of int * vectype
+             | T_ptrto of vectype | T_const of vectype
+             | T_void      | T_intQI
+             | T_intHI     | T_intSI
+             | T_intDI
+
+(* The meanings of the following are:
+     TImode : "Tetra", two registers (four words).
+     EImode : "hExa", three registers (six words).
+     OImode : "Octa", four registers (eight words).
+     CImode : "dodeCa", six registers (twelve words).
+     XImode : "heXadeca", eight registers (sixteen words).
+*)
+
+type inttype = B_TImode | B_EImode | B_OImode | B_CImode | B_XImode
+
+type shape_elt = Dreg | Qreg | Corereg | Immed | VecArray of int * shape_elt
+               | PtrTo of shape_elt | CstPtrTo of shape_elt
+	       (* These next ones are used only in the test generator.  *)
+	       | Element_of_dreg	(* Used for "lane" variants.  *)
+	       | Element_of_qreg	(* Likewise.  *)
+	       | All_elements_of_dreg	(* Used for "dup" variants.  *)
+
+type shape_form = All of int * shape_elt
+                | Long
+		| Long_noreg of shape_elt
+		| Wide
+		| Wide_noreg of shape_elt
+		| Narrow
+                | Long_imm
+                | Narrow_imm
+                | Binary_imm of shape_elt
+                | Use_operands of shape_elt array
+                | By_scalar of shape_elt
+                | Unary_scalar of shape_elt
+                | Wide_lane
+                | Wide_scalar
+                | Pair_result of shape_elt
+
+type arity = Arity0 of vectype
+           | Arity1 of vectype * vectype
+	   | Arity2 of vectype * vectype * vectype
+	   | Arity3 of vectype * vectype * vectype * vectype
+           | Arity4 of vectype * vectype * vectype * vectype * vectype
+
+type vecmode = V8QI | V4HI | V2SI | V2SF | DI
+             | V16QI | V8HI | V4SI | V4SF | V2DI
+             | QI | HI | SI | SF
+
+type opcode =
+  (* Binary ops.  *)
+    Vadd
+  | Vmul
+  | Vmla
+  | Vmls
+  | Vsub
+  | Vceq
+  | Vcge
+  | Vcgt
+  | Vcle
+  | Vclt
+  | Vcage
+  | Vcagt
+  | Vcale
+  | Vcalt
+  | Vtst
+  | Vabd
+  | Vaba
+  | Vmax
+  | Vmin
+  | Vpadd
+  | Vpada
+  | Vpmax
+  | Vpmin
+  | Vrecps
+  | Vrsqrts
+  | Vshl
+  | Vshr_n
+  | Vshl_n
+  | Vsra_n
+  | Vsri
+  | Vsli
+  (* Logic binops.  *)
+  | Vand
+  | Vorr
+  | Veor
+  | Vbic
+  | Vorn
+  | Vbsl
+  (* Ops with scalar.  *)
+  | Vmul_lane
+  | Vmla_lane
+  | Vmls_lane
+  | Vmul_n
+  | Vmla_n
+  | Vmls_n
+  | Vmull_n
+  | Vmull_lane
+  | Vqdmull_n
+  | Vqdmull_lane
+  | Vqdmulh_n
+  | Vqdmulh_lane
+  (* Unary ops.  *)
+  | Vabs
+  | Vneg
+  | Vcls
+  | Vclz
+  | Vcnt
+  | Vrecpe
+  | Vrsqrte
+  | Vmvn
+  (* Vector extract.  *)
+  | Vext
+  (* Reverse elements.  *)
+  | Vrev64
+  | Vrev32
+  | Vrev16
+  (* Transposition ops.  *)
+  | Vtrn
+  | Vzip
+  | Vuzp
+  (* Loads and stores (VLD1/VST1/VLD2...), elements and structures.  *)
+  | Vldx of int
+  | Vstx of int
+  | Vldx_lane of int
+  | Vldx_dup of int
+  | Vstx_lane of int
+  (* Set/extract lanes from a vector.  *)
+  | Vget_lane
+  | Vset_lane
+  (* Initialise vector from bit pattern.  *)
+  | Vcreate
+  (* Set all lanes to same value.  *)
+  | Vdup_n
+  | Vmov_n  (* Is this the same?  *)
+  (* Duplicate scalar to all lanes of vector.  *)
+  | Vdup_lane
+  (* Combine vectors.  *)
+  | Vcombine
+  (* Get quadword high/low parts.  *)
+  | Vget_high
+  | Vget_low
+  (* Convert vectors.  *)
+  | Vcvt
+  | Vcvt_n
+  (* Narrow/lengthen vectors.  *)
+  | Vmovn
+  | Vmovl
+  (* Table lookup.  *)
+  | Vtbl of int
+  | Vtbx of int
+  (* Reinterpret casts.  *)
+  | Vreinterp
+
+(* Features used for documentation, to distinguish between some instruction
+   variants, and to signal special requirements (e.g. swapping arguments).  *)
+
+type features =
+    Halving
+  | Rounding
+  | Saturating
+  | Dst_unsign
+  | High_half
+  | Doubling
+  | Flipped of string  (* Builtin name to use with flipped arguments.  *)
+  | InfoWord  (* Pass an extra word for signedness/rounding etc. (always passed
+                 for All _, Long, Wide, Narrow shape_forms).  *)
+  | ReturnPtr  (* Pass explicit pointer to return value as first argument.  *)
+    (* A specification as to the shape of instruction expected upon
+       disassembly, used if it differs from the shape used to build the
+       intrinsic prototype.  Multiple entries in the constructor's argument
+       indicate that the intrinsic expands to more than one assembly
+       instruction, each with a corresponding shape specified here.  *)
+  | Disassembles_as of shape_form list
+  | Builtin_name of string  (* Override the name of the builtin.  *)
+    (* Override the name of the instruction.  If more than one name
+       is specified, it means that the instruction can have any of those
+       names.  *)
+  | Instruction_name of string list
+    (* Mark that the intrinsic yields no instructions, or expands to yield
+       behaviour that the test generator cannot test.  *)
+  | No_op
+    (* Mark that the intrinsic has constant arguments that cannot be set
+       to the defaults (zero for pointers and one otherwise) in the test
+       cases.  The function supplied must return the integer to be written
+       into the testcase for the argument number (0-based) supplied to it.  *)
+  | Const_valuator of (int -> int)
+
+exception MixedMode of elts * elts
+
+let rec elt_width = function
+    S8 | U8 | P8 | I8 | B8 -> 8
+  | S16 | U16 | P16 | I16 | B16 -> 16
+  | S32 | F32 | U32 | I32 | B32 -> 32
+  | S64 | U64 | I64 | B64 -> 64
+  | Conv (a, b) ->
+      let wa = elt_width a and wb = elt_width b in
+      if wa = wb then wa else failwith "element width?"
+  | Cast (a, b) -> raise (MixedMode (a, b))
+  | NoElts -> failwith "No elts"
+
+let rec elt_class = function
+    S8 | S16 | S32 | S64 -> Signed
+  | U8 | U16 | U32 | U64 -> Unsigned
+  | P8 | P16 -> Poly
+  | F32 -> Float
+  | I8 | I16 | I32 | I64 -> Int
+  | B8 | B16 | B32 | B64 -> Bits
+  | Conv (a, b) | Cast (a, b) -> ConvClass (elt_class a, elt_class b)
+  | NoElts -> NoType
+
+let elt_of_class_width c w =
+  match c, w with
+    Signed, 8 -> S8
+  | Signed, 16 -> S16
+  | Signed, 32 -> S32
+  | Signed, 64 -> S64
+  | Float, 32 -> F32
+  | Unsigned, 8 -> U8
+  | Unsigned, 16 -> U16
+  | Unsigned, 32 -> U32
+  | Unsigned, 64 -> U64
+  | Poly, 8 -> P8
+  | Poly, 16 -> P16
+  | Int, 8 -> I8
+  | Int, 16 -> I16
+  | Int, 32 -> I32
+  | Int, 64 -> I64
+  | Bits, 8 -> B8
+  | Bits, 16 -> B16
+  | Bits, 32 -> B32
+  | Bits, 64 -> B64
+  | _ -> failwith "Bad element type"
+
+(* Return unsigned integer element the same width as argument.  *)
+let unsigned_of_elt elt =
+  elt_of_class_width Unsigned (elt_width elt)
+
+let signed_of_elt elt =
+  elt_of_class_width Signed (elt_width elt)
+
+(* Return untyped bits element the same width as argument.  *)
+let bits_of_elt elt =
+  elt_of_class_width Bits (elt_width elt)
+
+let non_signed_variant = function
+    S8 -> I8
+  | S16 -> I16
+  | S32 -> I32
+  | S64 -> I64
+  | U8 -> I8
+  | U16 -> I16
+  | U32 -> I32
+  | U64 -> I64
+  | x -> x
+
+let poly_unsigned_variant v =
+  let elclass = match elt_class v with
+    Poly -> Unsigned
+  | x -> x in
+  elt_of_class_width elclass (elt_width v)
+
+let widen_elt elt =
+  let w = elt_width elt
+  and c = elt_class elt in
+  elt_of_class_width c (w * 2)
+
+let narrow_elt elt =
+  let w = elt_width elt
+  and c = elt_class elt in
+  elt_of_class_width c (w / 2)
+
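A quick illustrative sketch of the element helpers above (not part of the
patch; it assumes the preceding definitions are in scope in an OCaml toplevel):

    let () =
      assert (elt_width S16 = 16);
      assert (elt_class P16 = Poly);
      assert (unsigned_of_elt S16 = U16);
      assert (bits_of_elt F32 = B32);
      assert (non_signed_variant U16 = I16);
      assert (widen_elt S8 = S16);
      assert (narrow_elt U32 = U16)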
+(* If we're trying to find a mode from a "Use_operands" instruction, use the
+   last vector operand as the dominant mode used to invoke the correct builtin.
+   We must stick to this rule in neon.md.  *)
+let find_key_operand operands =
+  let rec scan opno =
+    match operands.(opno) with
+      Qreg -> Qreg
+    | Dreg -> Dreg
+    | VecArray (_, Qreg) -> Qreg
+    | VecArray (_, Dreg) -> Dreg
+    | _ -> scan (opno-1)
+  in
+    scan ((Array.length operands) - 1)
+
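For illustration (not part of the patch, same assumptions as above),
find_key_operand scans from the last operand backwards and returns the
rightmost vector operand:

    let () =
      assert (find_key_operand [| Qreg; Dreg; Dreg; Immed |] = Dreg);
      assert (find_key_operand [| Dreg; CstPtrTo Corereg |] = Dreg);
      assert (find_key_operand [| VecArray (2, Qreg); PtrTo Corereg |] = Qreg)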
+let rec mode_of_elt elt shape =
+  let flt = match elt_class elt with
+    Float | ConvClass(_, Float) -> true | _ -> false in
+  let idx =
+    match elt_width elt with
+      8 -> 0 | 16 -> 1 | 32 -> 2 | 64 -> 3
+    | _ -> failwith "Bad element width"
+  in match shape with
+    All (_, Dreg) | By_scalar Dreg | Pair_result Dreg | Unary_scalar Dreg
+  | Binary_imm Dreg | Long_noreg Dreg | Wide_noreg Dreg ->
+      [| V8QI; V4HI; if flt then V2SF else V2SI; DI |].(idx)
+  | All (_, Qreg) | By_scalar Qreg | Pair_result Qreg | Unary_scalar Qreg
+  | Binary_imm Qreg | Long_noreg Qreg | Wide_noreg Qreg ->
+      [| V16QI; V8HI; if flt then V4SF else V4SI; V2DI |].(idx)
+  | All (_, (Corereg | PtrTo _ | CstPtrTo _)) ->
+      [| QI; HI; if flt then SF else SI; DI |].(idx)
+  | Long | Wide | Wide_lane | Wide_scalar
+  | Long_imm ->
+      [| V8QI; V4HI; V2SI; DI |].(idx)
+  | Narrow | Narrow_imm -> [| V16QI; V8HI; V4SI; V2DI |].(idx)
+  | Use_operands ops -> mode_of_elt elt (All (0, (find_key_operand ops)))
+  | _ -> failwith "invalid shape"
+
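Illustrative expected results for mode_of_elt (not part of the patch; the
Use_operands case falls back to the key-operand rule above):

    let () =
      assert (mode_of_elt S16 (All (2, Dreg)) = V4HI);
      assert (mode_of_elt F32 (All (2, Qreg)) = V4SF);
      assert (mode_of_elt U8 Narrow = V16QI);
      assert (mode_of_elt S8 (Use_operands [| Dreg; Qreg; Immed |]) = V16QI)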
+(* Modify an element type dependent on the shape of the instruction and the
+   operand number.  *)
+
+let shapemap shape no =
+  let ident = fun x -> x in
+  match shape with
+    All _ | Use_operands _ | By_scalar _ | Pair_result _ | Unary_scalar _
+  | Binary_imm _ -> ident
+  | Long | Long_noreg _ | Wide_scalar | Long_imm ->
+      [| widen_elt; ident; ident |].(no)
+  | Wide | Wide_noreg _ -> [| widen_elt; widen_elt; ident |].(no)
+  | Wide_lane -> [| widen_elt; ident; ident; ident |].(no)
+  | Narrow | Narrow_imm -> [| narrow_elt; ident; ident |].(no)
+
+(* Register type (D/Q) of an operand, based on shape and operand number.  *)
+
+let regmap shape no =
+  match shape with
+    All (_, reg) | Long_noreg reg | Wide_noreg reg -> reg
+  | Long -> [| Qreg; Dreg; Dreg |].(no)
+  | Wide -> [| Qreg; Qreg; Dreg |].(no)
+  | Narrow -> [| Dreg; Qreg; Qreg |].(no)
+  | Wide_lane -> [| Qreg; Dreg; Dreg; Immed |].(no)
+  | Wide_scalar -> [| Qreg; Dreg; Corereg |].(no)
+  | By_scalar reg -> [| reg; reg; Dreg; Immed |].(no)
+  | Unary_scalar reg -> [| reg; Dreg; Immed |].(no)
+  | Pair_result reg -> [| VecArray (2, reg); reg; reg |].(no)
+  | Binary_imm reg -> [| reg; reg; Immed |].(no)
+  | Long_imm -> [| Qreg; Dreg; Immed |].(no)
+  | Narrow_imm -> [| Dreg; Qreg; Immed |].(no)
+  | Use_operands these -> these.(no)
+
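A sketch (not part of the patch) of how shapemap and regmap combine for a
couple of shapes:

    let () =
      (* Long (e.g. vaddl): widened Q-reg result, D-reg sources.  *)
      assert (regmap Long 0 = Qreg && regmap Long 1 = Dreg);
      assert ((shapemap Long 0) S8 = S16);
      (* Pair_result (vtrn/vzip/vuzp): the result is an array of two regs.  *)
      assert (regmap (Pair_result Dreg) 0 = VecArray (2, Dreg))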
+let type_for_elt shape elt no =
+  let elt = (shapemap shape no) elt in
+  let reg = regmap shape no in
+  let rec type_for_reg_elt reg elt =
+    match reg with
+      Dreg ->
+        begin match elt with
+          S8 -> T_int8x8
+        | S16 -> T_int16x4
+        | S32 -> T_int32x2
+        | S64 -> T_int64x1
+        | U8 -> T_uint8x8
+        | U16 -> T_uint16x4
+        | U32 -> T_uint32x2
+        | U64 -> T_uint64x1
+        | F32 -> T_float32x2
+        | P8 -> T_poly8x8
+        | P16 -> T_poly16x4
+        | _ -> failwith "Bad elt type"
+        end
+    | Qreg ->
+        begin match elt with
+          S8 -> T_int8x16
+        | S16 -> T_int16x8
+        | S32 -> T_int32x4
+        | S64 -> T_int64x2
+        | U8 -> T_uint8x16
+        | U16 -> T_uint16x8
+        | U32 -> T_uint32x4
+        | U64 -> T_uint64x2
+        | F32 -> T_float32x4
+        | P8 -> T_poly8x16
+        | P16 -> T_poly16x8
+        | _ -> failwith "Bad elt type"
+        end
+    | Corereg ->
+        begin match elt with
+          S8 -> T_int8
+        | S16 -> T_int16
+        | S32 -> T_int32
+        | S64 -> T_int64
+        | U8 -> T_uint8
+        | U16 -> T_uint16
+        | U32 -> T_uint32
+        | U64 -> T_uint64
+        | P8 -> T_poly8
+        | P16 -> T_poly16
+        | F32 -> T_float32
+        | _ -> failwith "Bad elt type"
+        end
+    | Immed ->
+        T_immediate (0, 0)
+    | VecArray (num, sub) ->
+        T_arrayof (num, type_for_reg_elt sub elt)
+    | PtrTo x ->
+        T_ptrto (type_for_reg_elt x elt)
+    | CstPtrTo x ->
+        T_ptrto (T_const (type_for_reg_elt x elt))
+    (* Anything else is solely for the use of the test generator.  *)
+    | _ -> assert false
+  in
+    type_for_reg_elt reg elt
+
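Illustrative values of type_for_elt under the same assumptions as the sketches
above (not part of the patch):

    let () =
      assert (type_for_elt (All (2, Dreg)) S8 0 = T_int8x8);
      assert (type_for_elt Long U16 0 = T_uint32x4);
      assert (type_for_elt (Use_operands [| Dreg; CstPtrTo Corereg |]) U8 1
              = T_ptrto (T_const T_uint8))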
+(* Return size of a vector type, in bits.  *)
+let vectype_size = function
+    T_int8x8 | T_int16x4 | T_int32x2 | T_int64x1
+  | T_uint8x8 | T_uint16x4 | T_uint32x2 | T_uint64x1
+  | T_float32x2 | T_poly8x8 | T_poly16x4 -> 64
+  | T_int8x16 | T_int16x8 | T_int32x4 | T_int64x2
+  | T_uint8x16 | T_uint16x8  | T_uint32x4  | T_uint64x2
+  | T_float32x4 | T_poly8x16 | T_poly16x8 -> 128
+  | _ -> raise Not_found
+  
+let inttype_for_array num elttype =
+  let eltsize = vectype_size elttype in
+  let numwords = (num * eltsize) / 32 in
+  match numwords with
+    4 -> B_TImode
+  | 6 -> B_EImode
+  | 8 -> B_OImode
+  | 12 -> B_CImode
+  | 16 -> B_XImode
+  | _ -> failwith ("no int type for size " ^ string_of_int numwords)
+
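For example (illustrative only, not part of the patch), a 2-element array of
128-bit vectors occupies 8 words, i.e. OImode, matching the OI-mode vld2
Q-register patterns in neon.md above:

    let () =
      assert (vectype_size T_uint32x4 = 128);
      assert (inttype_for_array 2 T_uint32x4 = B_OImode);
      assert (inttype_for_array 3 T_int16x4 = B_EImode)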
+(* These functions return pairs of (internal, external) types, where "internal"
+   types are those seen by GCC, and "external" are those seen by the assembler.
+   These types aren't necessarily the same, since the intrinsics can munge more
+   than one C type into each assembler opcode.  *)
+
+let make_sign_invariant func shape elt =
+  let arity, elt' = func shape elt in
+  arity, non_signed_variant elt'
+
+(* Don't restrict any types.  *)
+
+let elts_same make_arity shape elt =
+  let vtype = type_for_elt shape elt in
+  make_arity vtype, elt
+
+(* As sign_invar_*, but when sign matters.  *)
+let elts_same_io_lane =
+  elts_same (fun vtype -> Arity4 (vtype 0, vtype 0, vtype 1, vtype 2, vtype 3))
+
+let elts_same_io =
+  elts_same (fun vtype -> Arity3 (vtype 0, vtype 0, vtype 1, vtype 2))
+
+let elts_same_2_lane =
+  elts_same (fun vtype -> Arity3 (vtype 0, vtype 1, vtype 2, vtype 3))
+
+let elts_same_3 = elts_same_2_lane
+
+let elts_same_2 =
+  elts_same (fun vtype -> Arity2 (vtype 0, vtype 1, vtype 2))
+
+let elts_same_1 =
+  elts_same (fun vtype -> Arity1 (vtype 0, vtype 1))
+
+(* Use for signed/unsigned invariant operations (i.e. where the operation
+   doesn't depend on the sign of the data).  *)
+
+let sign_invar_io_lane = make_sign_invariant elts_same_io_lane
+let sign_invar_io = make_sign_invariant elts_same_io
+let sign_invar_2_lane = make_sign_invariant elts_same_2_lane
+let sign_invar_2 = make_sign_invariant elts_same_2
+let sign_invar_1 = make_sign_invariant elts_same_1
+
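A brief sketch (not part of the patch) of the elts_same / sign_invar pairing,
assuming the definitions above:

    let () =
      assert (elts_same_2 (All (3, Dreg)) S16
              = (Arity2 (T_int16x4, T_int16x4, T_int16x4), S16));
      (* The sign-invariant variant keeps the C types but drops the sign
         from the element used to pick the builtin.  *)
      assert (sign_invar_2 (All (3, Dreg)) U8
              = (Arity2 (T_uint8x8, T_uint8x8, T_uint8x8), I8))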
+(* Sign-sensitive comparison.  *)
+
+let cmp_sign_matters shape elt =
+  let vtype = type_for_elt shape elt
+  and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in
+  Arity2 (rtype, vtype 1, vtype 2), elt
+
+(* Signed/unsigned invariant comparison.  *)
+
+let cmp_sign_invar shape elt =
+  let shape', elt' = cmp_sign_matters shape elt in
+  let elt'' =
+    match non_signed_variant elt' with
+      P8 -> I8
+    | x -> x
+  in
+    shape', elt''
+
+(* Comparison (VTST) where only the element width matters.  *)
+
+let cmp_bits shape elt =
+  let vtype = type_for_elt shape elt
+  and rtype = type_for_elt shape (unsigned_of_elt elt) 0
+  and bits_only = bits_of_elt elt in
+  Arity2 (rtype, vtype 1, vtype 2), bits_only
+
+let reg_shift shape elt =
+  let vtype = type_for_elt shape elt
+  and op2type = type_for_elt shape (signed_of_elt elt) 2 in
+  Arity2 (vtype 0, vtype 1, op2type), elt
+
+(* Genericised constant-shift type-generating function.  *)
+
+let const_shift mkimm ?arity ?result shape elt =
+  let op2type = (shapemap shape 2) elt in
+  let op2width = elt_width op2type in
+  let op2 = mkimm op2width
+  and op1 = type_for_elt shape elt 1
+  and r_elt =
+    match result with
+      None -> elt
+    | Some restriction -> restriction elt in
+  let rtype = type_for_elt shape r_elt 0 in
+  match arity with
+    None -> Arity2 (rtype, op1, op2), elt
+  | Some mkarity -> mkarity rtype op1 op2, elt
+
+(* Use for immediate right-shifts.  *)
+
+let shift_right shape elt =
+  const_shift (fun imm -> T_immediate (1, imm)) shape elt
+
+let shift_right_acc shape elt =
+  const_shift (fun imm -> T_immediate (1, imm))
+    ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt
+
+(* Use for immediate right-shifts when the operation doesn't care about
+   signedness.  *)
+
+let shift_right_sign_invar =
+  make_sign_invariant shift_right
+
+(* Immediate right-shift; result is unsigned even when operand is signed.  *)
+
+let shift_right_to_uns shape elt =
+  const_shift (fun imm -> T_immediate (1, imm)) ~result:unsigned_of_elt
+    shape elt
+
+(* Immediate left-shift.  *)
+
+let shift_left shape elt =
+  const_shift (fun imm -> T_immediate (0, imm - 1)) shape elt
+
+(* Immediate left-shift, unsigned result.  *)
+
+let shift_left_to_uns shape elt =
+  const_shift (fun imm -> T_immediate (0, imm - 1)) ~result:unsigned_of_elt
+    shape elt
+
+(* Immediate left-shift, don't care about signs.  *)
+
+let shift_left_sign_invar =
+  make_sign_invariant shift_left
+
+(* Shift left/right and insert: only element size matters.  *)
+
+let shift_insert shape elt =
+  let arity, elt =
+    const_shift (fun imm -> T_immediate (1, imm))
+    ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt in
+  arity, bits_of_elt elt
+
+(* Get/set lane.  *)
+
+let get_lane shape elt =
+  let vtype = type_for_elt shape elt in
+  Arity2 (vtype 0, vtype 1, vtype 2),
+    (match elt with P8 -> U8 | P16 -> U16 | x -> x)
+
+let set_lane shape elt =
+  let vtype = type_for_elt shape elt in
+  Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt
+
+let set_lane_notype shape elt =
+  let vtype = type_for_elt shape elt in
+  Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), NoElts
+
+let create_vector shape elt =
+  let vtype = type_for_elt shape U64 1
+  and rtype = type_for_elt shape elt 0 in
+  Arity1 (rtype, vtype), elt
+
+let conv make_arity shape elt =
+  let edest, esrc = match elt with
+    Conv (edest, esrc) | Cast (edest, esrc) -> edest, esrc
+  | _ -> failwith "Non-conversion element in conversion" in
+  let vtype = type_for_elt shape esrc
+  and rtype = type_for_elt shape edest 0 in
+  make_arity rtype vtype, elt
+
+let conv_1 = conv (fun rtype vtype -> Arity1 (rtype, vtype 1))
+let conv_2 = conv (fun rtype vtype -> Arity2 (rtype, vtype 1, vtype 2))
+
+(* Operation has an unsigned result even if operands are signed.  *)
+
+let dst_unsign make_arity shape elt =
+  let vtype = type_for_elt shape elt
+  and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in
+  make_arity rtype vtype, elt
+
+let dst_unsign_1 = dst_unsign (fun rtype vtype -> Arity1 (rtype, vtype 1))
+
+let make_bits_only func shape elt =
+  let arity, elt' = func shape elt in
+  arity, bits_of_elt elt'
+
+(* Extend operation.  *)
+
+let extend shape elt =
+  let vtype = type_for_elt shape elt in
+  Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt
+
+(* Table look-up operations. Operand 2 is signed/unsigned for signed/unsigned
+   integer ops respectively, or unsigned for polynomial ops.  *)
+
+let table mkarity shape elt =
+  let vtype = type_for_elt shape elt in
+  let op2 = type_for_elt shape (poly_unsigned_variant elt) 2 in
+  mkarity vtype op2, bits_of_elt elt
+
+let table_2 = table (fun vtype op2 -> Arity2 (vtype 0, vtype 1, op2))
+let table_io = table (fun vtype op2 -> Arity3 (vtype 0, vtype 0, vtype 1, op2))
+
+(* Operations where only bits matter.  *)
+
+let bits_1 = make_bits_only elts_same_1
+let bits_2 = make_bits_only elts_same_2
+let bits_3 = make_bits_only elts_same_3
+
+(* Store insns.  *)
+let store_1 shape elt =
+  let vtype = type_for_elt shape elt in
+  Arity2 (T_void, vtype 0, vtype 1), bits_of_elt elt
+
+let store_3 shape elt =
+  let vtype = type_for_elt shape elt in
+  Arity3 (T_void, vtype 0, vtype 1, vtype 2), bits_of_elt elt
+
+let make_notype func shape elt =
+  let arity, _ = func shape elt in
+  arity, NoElts
+
+let notype_1 = make_notype elts_same_1
+let notype_2 = make_notype elts_same_2
+let notype_3 = make_notype elts_same_3
+
+(* Bit-select operations (first operand is unsigned int).  *)
+
+let bit_select shape elt =
+  let vtype = type_for_elt shape elt
+  and itype = type_for_elt shape (unsigned_of_elt elt) in
+  Arity3 (vtype 0, itype 1, vtype 2, vtype 3), NoElts
+
+(* Common lists of supported element types.  *)
+
+let su_8_32 = [S8; S16; S32; U8; U16; U32]
+let su_8_64 = S64 :: U64 :: su_8_32
+let su_16_64 = [S16; S32; S64; U16; U32; U64]
+let pf_su_8_32 = P8 :: P16 :: F32 :: su_8_32
+let pf_su_8_64 = P8 :: P16 :: F32 :: su_8_64
+
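Spelled out, these shared lists are plain cons chains over the definitions above; a small check of what two of them expand to:

  let () =
    assert (su_8_64    = [S64; U64; S8; S16; S32; U8; U16; U32]);
    assert (pf_su_8_32 = [P8; P16; F32; S8; S16; S32; U8; U16; U32])
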
+let ops =
+  [
+    (* Addition.  *)
+    Vadd, [], All (3, Dreg), "vadd", sign_invar_2, F32 :: su_8_64;
+    Vadd, [], All (3, Qreg), "vaddQ", sign_invar_2, F32 :: su_8_64;
+    Vadd, [], Long, "vaddl", elts_same_2, su_8_32;
+    Vadd, [], Wide, "vaddw", elts_same_2, su_8_32;
+    Vadd, [Halving], All (3, Dreg), "vhadd", elts_same_2, su_8_32;
+    Vadd, [Halving], All (3, Qreg), "vhaddQ", elts_same_2, su_8_32;
+    Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving],
+      All (3, Dreg), "vRhadd", elts_same_2, su_8_32;
+    Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving],
+      All (3, Qreg), "vRhaddQ", elts_same_2, su_8_32;
+    Vadd, [Saturating], All (3, Dreg), "vqadd", elts_same_2, su_8_64;
+    Vadd, [Saturating], All (3, Qreg), "vqaddQ", elts_same_2, su_8_64;
+    Vadd, [High_half], Narrow, "vaddhn", sign_invar_2, su_16_64;
+    Vadd, [Instruction_name ["vraddhn"]; Rounding; High_half],
+      Narrow, "vRaddhn", sign_invar_2, su_16_64;
+ 
+    (* Multiplication.  *)
+    Vmul, [], All (3, Dreg), "vmul", sign_invar_2, P8 :: F32 :: su_8_32;
+    Vmul, [], All (3, Qreg), "vmulQ", sign_invar_2, P8 :: F32 :: su_8_32;
+    Vmul, [Saturating; Doubling; High_half], All (3, Dreg), "vqdmulh",
+      elts_same_2, [S16; S32];
+    Vmul, [Saturating; Doubling; High_half], All (3, Qreg), "vqdmulhQ",
+      elts_same_2, [S16; S32];
+    Vmul,
+      [Saturating; Rounding; Doubling; High_half;
+       Instruction_name ["vqrdmulh"]],
+      All (3, Dreg), "vqRdmulh",
+      elts_same_2, [S16; S32];
+    Vmul,
+      [Saturating; Rounding; Doubling; High_half;
+       Instruction_name ["vqrdmulh"]],
+      All (3, Qreg), "vqRdmulhQ",
+      elts_same_2, [S16; S32];
+    Vmul, [], Long, "vmull", elts_same_2, P8 :: su_8_32;
+    Vmul, [Saturating; Doubling], Long, "vqdmull", elts_same_2, [S16; S32];
+    
+    (* Multiply-accumulate. *)
+    Vmla, [], All (3, Dreg), "vmla", sign_invar_io, F32 :: su_8_32;
+    Vmla, [], All (3, Qreg), "vmlaQ", sign_invar_io, F32 :: su_8_32;
+    Vmla, [], Long, "vmlal", elts_same_io, su_8_32;
+    Vmla, [Saturating; Doubling], Long, "vqdmlal", elts_same_io, [S16; S32];
+    
+    (* Multiply-subtract.  *)
+    Vmls, [], All (3, Dreg), "vmls", sign_invar_io, F32 :: su_8_32;
+    Vmls, [], All (3, Qreg), "vmlsQ", sign_invar_io, F32 :: su_8_32;
+    Vmls, [], Long, "vmlsl", elts_same_io, su_8_32;
+    Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32];
+    
+    (* Subtraction.  *)
+    Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_64;
+    Vsub, [], All (3, Qreg), "vsubQ", sign_invar_2, F32 :: su_8_64;
+    Vsub, [], Long, "vsubl", elts_same_2, su_8_32;
+    Vsub, [], Wide, "vsubw", elts_same_2, su_8_32;
+    Vsub, [Halving], All (3, Dreg), "vhsub", elts_same_2, su_8_32;
+    Vsub, [Halving], All (3, Qreg), "vhsubQ", elts_same_2, su_8_32;
+    Vsub, [Saturating], All (3, Dreg), "vqsub", elts_same_2, su_8_64;
+    Vsub, [Saturating], All (3, Qreg), "vqsubQ", elts_same_2, su_8_64;
+    Vsub, [High_half], Narrow, "vsubhn", sign_invar_2, su_16_64;
+    Vsub, [Instruction_name ["vrsubhn"]; Rounding; High_half],
+      Narrow, "vRsubhn", sign_invar_2, su_16_64;
+    
+    (* Comparison, equal.  *)
+    Vceq, [], All (3, Dreg), "vceq", cmp_sign_invar, P8 :: F32 :: su_8_32;
+    Vceq, [], All (3, Qreg), "vceqQ", cmp_sign_invar, P8 :: F32 :: su_8_32;
+    
+    (* Comparison, greater-than or equal.  *)
+    Vcge, [], All (3, Dreg), "vcge", cmp_sign_matters, F32 :: su_8_32;
+    Vcge, [], All (3, Qreg), "vcgeQ", cmp_sign_matters, F32 :: su_8_32;
+    
+    (* Comparison, less-than or equal.  *)
+    Vcle, [Flipped "vcge"], All (3, Dreg), "vcle", cmp_sign_matters,
+      F32 :: su_8_32;
+    Vcle, [Instruction_name ["vcge"]; Flipped "vcgeQ"],
+      All (3, Qreg), "vcleQ", cmp_sign_matters,
+      F32 :: su_8_32;
+    
+    (* Comparison, greater-than.  *)
+    Vcgt, [], All (3, Dreg), "vcgt", cmp_sign_matters, F32 :: su_8_32;
+    Vcgt, [], All (3, Qreg), "vcgtQ", cmp_sign_matters, F32 :: su_8_32;
+    
+    (* Comparison, less-than.  *)
+    Vclt, [Flipped "vcgt"], All (3, Dreg), "vclt", cmp_sign_matters,
+      F32 :: su_8_32;
+    Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtQ"],
+      All (3, Qreg), "vcltQ", cmp_sign_matters,
+      F32 :: su_8_32;
+    
+    (* Compare absolute greater-than or equal.  *)
+    Vcage, [Instruction_name ["vacge"]],
+      All (3, Dreg), "vcage", cmp_sign_matters, [F32];
+    Vcage, [Instruction_name ["vacge"]],
+      All (3, Qreg), "vcageQ", cmp_sign_matters, [F32];
+
+    (* Compare absolute less-than or equal.  *)
+    Vcale, [Instruction_name ["vacge"]; Flipped "vcage"],
+      All (3, Dreg), "vcale", cmp_sign_matters, [F32];
+    Vcale, [Instruction_name ["vacge"]; Flipped "vcageQ"],
+      All (3, Qreg), "vcaleQ", cmp_sign_matters, [F32];
+    
+    (* Compare absolute greater-than.  *)
+    Vcagt, [Instruction_name ["vacgt"]],
+      All (3, Dreg), "vcagt", cmp_sign_matters, [F32];
+    Vcagt, [Instruction_name ["vacgt"]],
+      All (3, Qreg), "vcagtQ", cmp_sign_matters, [F32];
+    
+    (* Compare absolute less-than.  *)
+    Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagt"],
+      All (3, Dreg), "vcalt", cmp_sign_matters, [F32];
+    Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagtQ"],
+      All (3, Qreg), "vcaltQ", cmp_sign_matters, [F32];
+    
+    (* Test bits.  *)
+    Vtst, [], All (3, Dreg), "vtst", cmp_bits, P8 :: su_8_32;
+    Vtst, [], All (3, Qreg), "vtstQ", cmp_bits, P8 :: su_8_32;
+
+    (* Absolute difference.  *)
+    Vabd, [], All (3, Dreg), "vabd", elts_same_2, F32 :: su_8_32;
+    Vabd, [], All (3, Qreg), "vabdQ", elts_same_2, F32 :: su_8_32;
+    Vabd, [], Long, "vabdl", elts_same_2, su_8_32;
+
+    (* Absolute difference and accumulate.  *)
+    Vaba, [], All (3, Dreg), "vaba", elts_same_io, su_8_32;
+    Vaba, [], All (3, Qreg), "vabaQ", elts_same_io, su_8_32;
+    Vaba, [], Long, "vabal", elts_same_io, su_8_32;
+
+    (* Max.  *)
+    Vmax, [], All (3, Dreg), "vmax", elts_same_2, F32 :: su_8_32;
+    Vmax, [], All (3, Qreg), "vmaxQ", elts_same_2, F32 :: su_8_32;
+    
+    (* Min.  *)
+    Vmin, [], All (3, Dreg), "vmin", elts_same_2, F32 :: su_8_32;
+    Vmin, [], All (3, Qreg), "vminQ", elts_same_2, F32 :: su_8_32;
+
+    (* Pairwise add.  *)
+    Vpadd, [], All (3, Dreg), "vpadd", sign_invar_2, F32 :: su_8_32;
+    Vpadd, [], Long_noreg Dreg, "vpaddl", elts_same_1, su_8_32;
+    Vpadd, [], Long_noreg Qreg, "vpaddlQ", elts_same_1, su_8_32;
+      
+    (* Pairwise add, widen and accumulate.  *)
+    Vpada, [], Wide_noreg Dreg, "vpadal", elts_same_2, su_8_32;
+    Vpada, [], Wide_noreg Qreg, "vpadalQ", elts_same_2, su_8_32;
+
+    (* Folding maximum, minimum.  *)
+    Vpmax, [], All (3, Dreg), "vpmax", elts_same_2, F32 :: su_8_32;
+    Vpmin, [], All (3, Dreg), "vpmin", elts_same_2, F32 :: su_8_32;
+    
+    (* Reciprocal step.  *)
+    Vrecps, [], All (3, Dreg), "vrecps", elts_same_2, [F32];
+    Vrecps, [], All (3, Qreg), "vrecpsQ", elts_same_2, [F32];
+    Vrsqrts, [], All (3, Dreg), "vrsqrts", elts_same_2, [F32];
+    Vrsqrts, [], All (3, Qreg), "vrsqrtsQ", elts_same_2, [F32];
+
+    (* Vector shift left.  *)
+    Vshl, [], All (3, Dreg), "vshl", reg_shift, su_8_64;
+    Vshl, [], All (3, Qreg), "vshlQ", reg_shift, su_8_64;
+    Vshl, [Instruction_name ["vrshl"]; Rounding],
+      All (3, Dreg), "vRshl", reg_shift, su_8_64;
+    Vshl, [Instruction_name ["vrshl"]; Rounding],
+      All (3, Qreg), "vRshlQ", reg_shift, su_8_64;
+    Vshl, [Saturating], All (3, Dreg), "vqshl", reg_shift, su_8_64;
+    Vshl, [Saturating], All (3, Qreg), "vqshlQ", reg_shift, su_8_64;
+    Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding],
+      All (3, Dreg), "vqRshl", reg_shift, su_8_64;
+    Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding],
+      All (3, Qreg), "vqRshlQ", reg_shift, su_8_64;
+
+    (* Vector shift right by constant.  *)
+    Vshr_n, [], Binary_imm Dreg, "vshr_n", shift_right, su_8_64;
+    Vshr_n, [], Binary_imm Qreg, "vshrQ_n", shift_right, su_8_64;
+    Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Dreg,
+      "vRshr_n", shift_right, su_8_64;
+    Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Qreg,
+      "vRshrQ_n", shift_right, su_8_64;
+    Vshr_n, [], Narrow_imm, "vshrn_n", shift_right_sign_invar, su_16_64;
+    Vshr_n, [Instruction_name ["vrshrn"]; Rounding], Narrow_imm, "vRshrn_n",
+      shift_right_sign_invar, su_16_64;
+    Vshr_n, [Saturating], Narrow_imm, "vqshrn_n", shift_right, su_16_64;
+    Vshr_n, [Instruction_name ["vqrshrn"]; Saturating; Rounding], Narrow_imm,
+      "vqRshrn_n", shift_right, su_16_64;
+    Vshr_n, [Saturating; Dst_unsign], Narrow_imm, "vqshrun_n",
+      shift_right_to_uns, [S16; S32; S64];
+    Vshr_n, [Instruction_name ["vqrshrun"]; Saturating; Dst_unsign; Rounding],
+      Narrow_imm, "vqRshrun_n", shift_right_to_uns, [S16; S32; S64];
+
+    (* Vector shift left by constant.  *)
+    Vshl_n, [], Binary_imm Dreg, "vshl_n", shift_left_sign_invar, su_8_64;
+    Vshl_n, [], Binary_imm Qreg, "vshlQ_n", shift_left_sign_invar, su_8_64;
+    Vshl_n, [Saturating], Binary_imm Dreg, "vqshl_n", shift_left, su_8_64;
+    Vshl_n, [Saturating], Binary_imm Qreg, "vqshlQ_n", shift_left, su_8_64;
+    Vshl_n, [Saturating; Dst_unsign], Binary_imm Dreg, "vqshlu_n",
+      shift_left_to_uns, [S8; S16; S32; S64];
+    Vshl_n, [Saturating; Dst_unsign], Binary_imm Qreg, "vqshluQ_n",
+      shift_left_to_uns, [S8; S16; S32; S64];
+    Vshl_n, [], Long_imm, "vshll_n", shift_left, su_8_32;
+
+    (* Vector shift right by constant and accumulate.  *)
+    Vsra_n, [], Binary_imm Dreg, "vsra_n", shift_right_acc, su_8_64;
+    Vsra_n, [], Binary_imm Qreg, "vsraQ_n", shift_right_acc, su_8_64;
+    Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Dreg,
+      "vRsra_n", shift_right_acc, su_8_64;
+    Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Qreg,
+      "vRsraQ_n", shift_right_acc, su_8_64;
+
+    (* Vector shift right and insert.  *)
+    Vsri, [], Use_operands [| Dreg; Dreg; Immed |], "vsri_n", shift_insert,
+      P8 :: P16 :: su_8_64;
+    Vsri, [], Use_operands [| Qreg; Qreg; Immed |], "vsriQ_n", shift_insert,
+      P8 :: P16 :: su_8_64;
+    
+    (* Vector shift left and insert.  *)
+    Vsli, [], Use_operands [| Dreg; Dreg; Immed |], "vsli_n", shift_insert,
+      P8 :: P16 :: su_8_64;
+    Vsli, [], Use_operands [| Qreg; Qreg; Immed |], "vsliQ_n", shift_insert,
+      P8 :: P16 :: su_8_64;
+
+    (* Absolute value.  *)
+    Vabs, [], All (2, Dreg), "vabs", elts_same_1, [S8; S16; S32; F32];
+    Vabs, [], All (2, Qreg), "vabsQ", elts_same_1, [S8; S16; S32; F32];
+    Vabs, [Saturating], All (2, Dreg), "vqabs", elts_same_1, [S8; S16; S32];
+    Vabs, [Saturating], All (2, Qreg), "vqabsQ", elts_same_1, [S8; S16; S32];
+    
+    (* Negate.  *)
+    Vneg, [], All (2, Dreg), "vneg", elts_same_1, [S8; S16; S32; F32];
+    Vneg, [], All (2, Qreg), "vnegQ", elts_same_1, [S8; S16; S32; F32];
+    Vneg, [Saturating], All (2, Dreg), "vqneg", elts_same_1, [S8; S16; S32];
+    Vneg, [Saturating], All (2, Qreg), "vqnegQ", elts_same_1, [S8; S16; S32];
+    
+    (* Bitwise not.  *)
+    Vmvn, [], All (2, Dreg), "vmvn", notype_1, P8 :: su_8_32;
+    Vmvn, [], All (2, Qreg), "vmvnQ", notype_1, P8 :: su_8_32;
+    
+    (* Count leading sign bits.  *)
+    Vcls, [], All (2, Dreg), "vcls", elts_same_1, [S8; S16; S32];
+    Vcls, [], All (2, Qreg), "vclsQ", elts_same_1, [S8; S16; S32];
+    
+    (* Count leading zeros.  *)
+    Vclz, [], All (2, Dreg), "vclz", sign_invar_1, su_8_32;
+    Vclz, [], All (2, Qreg), "vclzQ", sign_invar_1, su_8_32;
+    
+    (* Count number of set bits.  *)
+    Vcnt, [], All (2, Dreg), "vcnt", bits_1, [P8; S8; U8];
+    Vcnt, [], All (2, Qreg), "vcntQ", bits_1, [P8; S8; U8];
+    
+    (* Reciprocal estimate.  *)
+    Vrecpe, [], All (2, Dreg), "vrecpe", elts_same_1, [U32; F32];
+    Vrecpe, [], All (2, Qreg), "vrecpeQ", elts_same_1, [U32; F32];
+    
+    (* Reciprocal square-root estimate.  *)
+    Vrsqrte, [], All (2, Dreg), "vrsqrte", elts_same_1, [U32; F32];
+    Vrsqrte, [], All (2, Qreg), "vrsqrteQ", elts_same_1, [U32; F32];
+    
+    (* Get lanes from a vector.  *)
+    Vget_lane,
+      [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]];
+       Instruction_name ["vmov"]],
+      Use_operands [| Corereg; Dreg; Immed |],
+      "vget_lane", get_lane, pf_su_8_32;
+    Vget_lane,
+      [InfoWord;
+       Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]];
+       Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
+      Use_operands [| Corereg; Dreg; Immed |],
+      "vget_lane", notype_2, [S64; U64];
+    Vget_lane,
+      [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]];
+       Instruction_name ["vmov"]],
+      Use_operands [| Corereg; Qreg; Immed |],
+      "vgetQ_lane", get_lane, pf_su_8_32;
+    Vget_lane,
+      [InfoWord;
+       Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]];
+       Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
+      Use_operands [| Corereg; Qreg; Immed |],
+      "vgetQ_lane", notype_2, [S64; U64];
+    
+    (* Set lanes in a vector.  *)
+    Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]];
+                Instruction_name ["vmov"]],
+      Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane",
+      set_lane, pf_su_8_32;
+    Vset_lane, [Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]];
+                Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
+      Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane",
+      set_lane_notype, [S64; U64];
+    Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]];
+                Instruction_name ["vmov"]],
+      Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane",
+      set_lane, pf_su_8_32;
+    Vset_lane, [Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]];
+                Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
+      Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane",
+      set_lane_notype, [S64; U64];
+      
+    (* Create vector from literal bit pattern.  *)
+    Vcreate,
+      [No_op], (* Not really, but it can yield various things that are too
+                  hard for the test generator at this time.  *)
+      Use_operands [| Dreg; Corereg |], "vcreate", create_vector,
+      pf_su_8_64;
+    
+    (* Set all lanes to the same value.  *)
+    Vdup_n, [],
+      Use_operands [| Dreg; Corereg |], "vdup_n", bits_1,
+      pf_su_8_32;
+    Vdup_n,
+      [Instruction_name ["vmov"];
+       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
+      Use_operands [| Dreg; Corereg |], "vdup_n", notype_1,
+      [S64; U64];
+    Vdup_n, [],
+      Use_operands [| Qreg; Corereg |], "vdupQ_n", bits_1,
+      pf_su_8_32;
+    Vdup_n,
+      [Instruction_name ["vmov"];
+       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |];
+                        Use_operands [| Dreg; Corereg; Corereg |]]],
+      Use_operands [| Qreg; Corereg |], "vdupQ_n", notype_1,
+      [S64; U64];
+
+    (* These are just aliases for the above.  *)
+    Vmov_n,
+      [Builtin_name "vdup_n"],
+      Use_operands [| Dreg; Corereg |],
+      "vmov_n", bits_1, pf_su_8_32;
+    Vmov_n,
+      [Builtin_name "vdup_n";
+       Instruction_name ["vmov"];
+       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
+      Use_operands [| Dreg; Corereg |],
+      "vmov_n", notype_1, [S64; U64];
+    Vmov_n,
+      [Builtin_name "vdupQ_n"],
+      Use_operands [| Qreg; Corereg |],
+      "vmovQ_n", bits_1, pf_su_8_32;
+    Vmov_n,
+      [Builtin_name "vdupQ_n";
+       Instruction_name ["vmov"];
+       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |];
+                        Use_operands [| Dreg; Corereg; Corereg |]]],
+      Use_operands [| Qreg; Corereg |],
+      "vmovQ_n", notype_1, [S64; U64];
+
+    (* Duplicate, lane version.  We can't use Use_operands here because the
+       rightmost register (always Dreg) would be picked up by find_key_operand,
+       when we want the leftmost register to be used in this case (otherwise
+       the modes are indistinguishable in neon.md, etc.).  *)
+    Vdup_lane,
+      [Disassembles_as [Use_operands [| Dreg; Element_of_dreg |]]],
+      Unary_scalar Dreg, "vdup_lane", bits_2, pf_su_8_32;
+    Vdup_lane,
+      [No_op; Const_valuator (fun _ -> 0)],
+      Unary_scalar Dreg, "vdup_lane", bits_2, [S64; U64];
+    Vdup_lane,
+      [Disassembles_as [Use_operands [| Qreg; Element_of_dreg |]]],
+      Unary_scalar Qreg, "vdupQ_lane", bits_2, pf_su_8_32;
+    Vdup_lane,
+      [No_op; Const_valuator (fun _ -> 0)],
+      Unary_scalar Qreg, "vdupQ_lane", bits_2, [S64; U64];
+
+    (* Combining vectors.  *)
+    Vcombine, [No_op],
+      Use_operands [| Qreg; Dreg; Dreg |], "vcombine", notype_2,
+      pf_su_8_64;
+    
+    (* Splitting vectors.  *)
+    Vget_high, [No_op],
+      Use_operands [| Dreg; Qreg |], "vget_high",
+      notype_1, pf_su_8_64;
+    Vget_low, [Instruction_name ["vmov"];
+               Disassembles_as [Use_operands [| Dreg; Dreg |]]],
+      Use_operands [| Dreg; Qreg |], "vget_low",
+      notype_1, pf_su_8_64;
+    
+    (* Conversions.  *)
+    Vcvt, [InfoWord], All (2, Dreg), "vcvt", conv_1,
+      [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
+    Vcvt, [InfoWord], All (2, Qreg), "vcvtQ", conv_1,
+      [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
+    Vcvt_n, [InfoWord], Use_operands [| Dreg; Dreg; Immed |], "vcvt_n", conv_2,
+      [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
+    Vcvt_n, [InfoWord], Use_operands [| Qreg; Qreg; Immed |], "vcvtQ_n", conv_2,
+      [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
+      
+    (* Move, narrowing.  *)
+    Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]],
+      Narrow, "vmovn", sign_invar_1, su_16_64;
+    Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating],
+      Narrow, "vqmovn", elts_same_1, su_16_64;
+    Vmovn,
+      [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating; Dst_unsign],
+      Narrow, "vqmovun", dst_unsign_1,
+      [S16; S32; S64];
+
+    (* Move, long.  *)
+    Vmovl, [Disassembles_as [Use_operands [| Qreg; Dreg |]]],
+      Long, "vmovl", elts_same_1, su_8_32;
+    
+    (* Table lookup.  *)
+    Vtbl 1,
+      [Instruction_name ["vtbl"];
+       Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]],
+      Use_operands [| Dreg; Dreg; Dreg |], "vtbl1", table_2, [U8; S8; P8];
+    Vtbl 2, [Instruction_name ["vtbl"]],
+      Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbl2", table_2,
+      [U8; S8; P8];
+    Vtbl 3, [Instruction_name ["vtbl"]],
+      Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbl3", table_2,
+      [U8; S8; P8];
+    Vtbl 4, [Instruction_name ["vtbl"]],
+      Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbl4", table_2,
+      [U8; S8; P8];
+    
+    (* Extended table lookup.  *)
+    Vtbx 1,
+      [Instruction_name ["vtbx"];
+       Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]],
+      Use_operands [| Dreg; Dreg; Dreg |], "vtbx1", table_io, [U8; S8; P8];
+    Vtbx 2, [Instruction_name ["vtbx"]],
+      Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbx2", table_io,
+      [U8; S8; P8];
+    Vtbx 3, [Instruction_name ["vtbx"]],
+      Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbx3", table_io,
+      [U8; S8; P8];
+    Vtbx 4, [Instruction_name ["vtbx"]],
+      Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbx4", table_io,
+      [U8; S8; P8];
+    
+    (* Multiply, lane.  (note: these were undocumented at the time of
+       writing).  *)
+    Vmul_lane, [], By_scalar Dreg, "vmul_lane", sign_invar_2_lane,
+      [S16; S32; U16; U32; F32];
+    Vmul_lane, [], By_scalar Qreg, "vmulQ_lane", sign_invar_2_lane,
+      [S16; S32; U16; U32; F32];
+    
+    (* Multiply-accumulate, lane.  *)
+    Vmla_lane, [], By_scalar Dreg, "vmla_lane", sign_invar_io_lane,
+      [S16; S32; U16; U32; F32];
+    Vmla_lane, [], By_scalar Qreg, "vmlaQ_lane", sign_invar_io_lane,
+      [S16; S32; U16; U32; F32];
+    Vmla_lane, [], Wide_lane, "vmlal_lane", elts_same_io_lane,
+      [S16; S32; U16; U32];
+    Vmla_lane, [Saturating; Doubling], Wide_lane, "vqdmlal_lane",
+      elts_same_io_lane, [S16; S32];
+    
+    (* Multiply-subtract, lane.  *)
+    Vmls_lane, [], By_scalar Dreg, "vmls_lane", sign_invar_io_lane,
+      [S16; S32; U16; U32; F32];
+    Vmls_lane, [], By_scalar Qreg, "vmlsQ_lane", sign_invar_io_lane,
+      [S16; S32; U16; U32; F32];
+    Vmls_lane, [], Wide_lane, "vmlsl_lane", elts_same_io_lane,
+      [S16; S32; U16; U32];
+    Vmls_lane, [Saturating; Doubling], Wide_lane, "vqdmlsl_lane",
+      elts_same_io_lane, [S16; S32];
+
+    (* Long multiply, lane.  *)
+    Vmull_lane, [],
+      Wide_lane, "vmull_lane", elts_same_2_lane, [S16; S32; U16; U32];
+
+    (* Saturating doubling long multiply, lane.  *)
+    Vqdmull_lane, [Saturating; Doubling],
+      Wide_lane, "vqdmull_lane", elts_same_2_lane, [S16; S32];
+
+    (* Saturating doubling multiply high, lane.  *)
+    Vqdmulh_lane, [Saturating; Halving],
+      By_scalar Qreg, "vqdmulhQ_lane", elts_same_2_lane, [S16; S32];
+    Vqdmulh_lane, [Saturating; Halving],
+      By_scalar Dreg, "vqdmulh_lane", elts_same_2_lane, [S16; S32];
+    Vqdmulh_lane, [Saturating; Halving; Rounding;
+		   Instruction_name ["vqrdmulh"]],
+      By_scalar Qreg, "vqRdmulhQ_lane", elts_same_2_lane, [S16; S32];
+    Vqdmulh_lane, [Saturating; Halving; Rounding;
+		   Instruction_name ["vqrdmulh"]],
+      By_scalar Dreg, "vqRdmulh_lane", elts_same_2_lane, [S16; S32];
+
+    (* Vector multiply by scalar.  *)
+    Vmul_n, [InfoWord;
+             Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
+             Use_operands [| Dreg; Dreg; Corereg |], "vmul_n",
+      sign_invar_2, [S16; S32; U16; U32; F32];
+    Vmul_n, [InfoWord;
+             Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
+             Use_operands [| Qreg; Qreg; Corereg |], "vmulQ_n",
+      sign_invar_2, [S16; S32; U16; U32; F32];
+
+    (* Vector long multiply by scalar.  *)
+    Vmull_n, [Instruction_name ["vmull"];
+              Disassembles_as [Use_operands [| Qreg; Dreg; Element_of_dreg |]]],
+              Wide_scalar, "vmull_n",
+      elts_same_2, [S16; S32; U16; U32];
+
+    (* Vector saturating doubling long multiply by scalar.  *)
+    Vqdmull_n, [Saturating; Doubling;
+	        Disassembles_as [Use_operands [| Qreg; Dreg;
+						 Element_of_dreg |]]],
+                Wide_scalar, "vqdmull_n",
+      elts_same_2, [S16; S32];
+
+    (* Vector saturating doubling multiply high by scalar.  *)
+    Vqdmulh_n,
+      [Saturating; Halving; InfoWord;
+       Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
+      Use_operands [| Qreg; Qreg; Corereg |],
+      "vqdmulhQ_n", elts_same_2, [S16; S32];
+    Vqdmulh_n,
+      [Saturating; Halving; InfoWord;
+       Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
+      Use_operands [| Dreg; Dreg; Corereg |],
+      "vqdmulh_n", elts_same_2, [S16; S32];
+    Vqdmulh_n,
+      [Saturating; Halving; Rounding; InfoWord;
+       Instruction_name ["vqrdmulh"];
+       Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
+      Use_operands [| Qreg; Qreg; Corereg |],
+      "vqRdmulhQ_n", elts_same_2, [S16; S32];
+    Vqdmulh_n,
+      [Saturating; Halving; Rounding; InfoWord;
+       Instruction_name ["vqrdmulh"];
+       Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
+      Use_operands [| Dreg; Dreg; Corereg |],
+      "vqRdmulh_n", elts_same_2, [S16; S32];
+
+    (* Vector multiply-accumulate by scalar.  *)
+    Vmla_n, [InfoWord;
+             Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
+      Use_operands [| Dreg; Dreg; Corereg |], "vmla_n",
+      sign_invar_io, [S16; S32; U16; U32; F32];
+    Vmla_n, [InfoWord;
+             Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
+      Use_operands [| Qreg; Qreg; Corereg |], "vmlaQ_n",
+      sign_invar_io, [S16; S32; U16; U32; F32];
+    Vmla_n, [], Wide_scalar, "vmlal_n", elts_same_io, [S16; S32; U16; U32];
+    Vmla_n, [Saturating; Doubling], Wide_scalar, "vqdmlal_n", elts_same_io,
+      [S16; S32];
+
+    (* Vector multiply subtract by scalar.  *)
+    Vmls_n, [InfoWord;
+             Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
+      Use_operands [| Dreg; Dreg; Corereg |], "vmls_n",
+      sign_invar_io, [S16; S32; U16; U32; F32];
+    Vmls_n, [InfoWord;
+             Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
+      Use_operands [| Qreg; Qreg; Corereg |], "vmlsQ_n",
+      sign_invar_io, [S16; S32; U16; U32; F32];
+    Vmls_n, [], Wide_scalar, "vmlsl_n", elts_same_io, [S16; S32; U16; U32];
+    Vmls_n, [Saturating; Doubling], Wide_scalar, "vqdmlsl_n", elts_same_io,
+      [S16; S32];
+    
+    (* Vector extract.  *)
+    Vext, [Const_valuator (fun _ -> 0)],
+      Use_operands [| Dreg; Dreg; Dreg; Immed |], "vext", extend,
+      pf_su_8_64;
+    Vext, [Const_valuator (fun _ -> 0)],
+      Use_operands [| Qreg; Qreg; Qreg; Immed |], "vextQ", extend,
+      pf_su_8_64;
+      
+    (* Reverse elements.  *)
+    Vrev64, [], All (2, Dreg), "vrev64", bits_1, P8 :: P16 :: F32 :: su_8_32;
+    Vrev64, [], All (2, Qreg), "vrev64Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
+    Vrev32, [], All (2, Dreg), "vrev32", bits_1, [P8; P16; S8; U8; S16; U16];
+    Vrev32, [], All (2, Qreg), "vrev32Q", bits_1, [P8; P16; S8; U8; S16; U16];
+    Vrev16, [], All (2, Dreg), "vrev16", bits_1, [P8; S8; U8];
+    Vrev16, [], All (2, Qreg), "vrev16Q", bits_1, [P8; S8; U8];
+
+    (* Bit selection.  *)    
+    Vbsl,
+      [Instruction_name ["vbsl"; "vbit"; "vbif"];
+       Disassembles_as [Use_operands [| Dreg; Dreg; Dreg |]]],
+      Use_operands [| Dreg; Dreg; Dreg; Dreg |], "vbsl", bit_select,
+      pf_su_8_64;
+    Vbsl,
+      [Instruction_name ["vbsl"; "vbit"; "vbif"];
+       Disassembles_as [Use_operands [| Qreg; Qreg; Qreg |]]],
+      Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select,
+      pf_su_8_64;
+    
+    (* Transpose elements.  **NOTE** ReturnPtr goes some of the way towards
+       generating good code for intrinsics which return structure types --
+       builtins work well by themselves (and understand that the values being
+       stored on e.g. the stack also reside in registers, so can optimise the
+       stores away entirely if the results are used immediately), but
+       intrinsics are very much less efficient. Maybe something can be improved
+       re: inlining, or tweaking the ABI used for intrinsics (a special call
+       attribute?).
+    *)
+    Vtrn, [ReturnPtr], Pair_result Dreg, "vtrn", bits_2, pf_su_8_32;
+    Vtrn, [ReturnPtr], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32;
+    
+    (* Zip elements.  *)
+    Vzip, [ReturnPtr], Pair_result Dreg, "vzip", bits_2, pf_su_8_32;
+    Vzip, [ReturnPtr], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32;
+    
+    (* Unzip elements.  *)
+    Vuzp, [ReturnPtr], Pair_result Dreg, "vuzp", bits_2, pf_su_8_32;
+    Vuzp, [ReturnPtr], Pair_result Qreg, "vuzpQ", bits_2, pf_su_8_32;
+    
+    (* Element/structure loads.  VLD1 variants.  *)
+    Vldx 1,
+      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+                                        CstPtrTo Corereg |]]],
+      Use_operands [| Dreg; CstPtrTo Corereg |], "vld1", bits_1,
+      pf_su_8_64;
+    Vldx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+					      CstPtrTo Corereg |]]],
+      Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q", bits_1,
+      pf_su_8_64;
+    
+    Vldx_lane 1,
+      [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
+                                        CstPtrTo Corereg |]]],
+      Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
+      "vld1_lane", bits_3, pf_su_8_32;
+    Vldx_lane 1,
+      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+                                        CstPtrTo Corereg |]];
+       Const_valuator (fun _ -> 0)],
+      Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
+      "vld1_lane", bits_3, [S64; U64];
+    Vldx_lane 1,
+      [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
+                                        CstPtrTo Corereg |]]],
+      Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
+      "vld1Q_lane", bits_3, pf_su_8_32;
+    Vldx_lane 1,
+      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+                                        CstPtrTo Corereg |]]],
+      Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
+      "vld1Q_lane", bits_3, [S64; U64];
+    
+    Vldx_dup 1, 
+      [Disassembles_as [Use_operands [| VecArray (1, All_elements_of_dreg);
+                                        CstPtrTo Corereg |]]],
+      Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
+      bits_1, pf_su_8_32;
+    Vldx_dup 1, 
+      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+                                        CstPtrTo Corereg |]]],
+      Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
+      bits_1, [S64; U64];
+    Vldx_dup 1,
+      [Disassembles_as [Use_operands [| VecArray (2, All_elements_of_dreg);
+                                        CstPtrTo Corereg |]]],
+      Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
+      bits_1, pf_su_8_32;
+    Vldx_dup 1,
+      [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+                                        CstPtrTo Corereg |]]],
+      Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
+      bits_1, [S64; U64];
+   
+    (* VST1 variants.  *) 
+    Vstx 1, [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+                                              PtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; Dreg |], "vst1",
+      store_1, pf_su_8_64;
+    Vstx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+					      PtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; Qreg |], "vst1Q",
+      store_1, pf_su_8_64;
+      
+    Vstx_lane 1, 
+      [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
+                                        CstPtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; Dreg; Immed |],
+      "vst1_lane", store_3, pf_su_8_32;
+    Vstx_lane 1, 
+      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+                                        CstPtrTo Corereg |]];
+       Const_valuator (fun _ -> 0)],
+      Use_operands [| PtrTo Corereg; Dreg; Immed |],
+      "vst1_lane", store_3, [U64; S64];
+    Vstx_lane 1,
+      [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
+                                        CstPtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; Qreg; Immed |],
+      "vst1Q_lane", store_3, pf_su_8_32;
+    Vstx_lane 1,
+      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+                                        CstPtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; Qreg; Immed |],
+      "vst1Q_lane", store_3, [U64; S64];
+    
+    (* VLD2 variants.  *)
+    Vldx 2, [], Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
+      "vld2", bits_1, pf_su_8_32;
+    Vldx 2, [Instruction_name ["vld1"]],
+       Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
+      "vld2", bits_1, [S64; U64];
+    Vldx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+                                              CstPtrTo Corereg |];
+                              Use_operands [| VecArray (2, Dreg);
+					      CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg |],
+      "vld2Q", bits_1, pf_su_8_32;
+    
+    Vldx_lane 2, 
+      [Disassembles_as [Use_operands
+        [| VecArray (2, Element_of_dreg);
+           CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg;
+                      VecArray (2, Dreg); Immed |],
+      "vld2_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
+    Vldx_lane 2, 
+      [Disassembles_as [Use_operands
+        [| VecArray (2, Element_of_dreg);
+           CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg;
+ 	              VecArray (2, Qreg); Immed |],
+      "vld2Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];
+    
+    Vldx_dup 2,
+      [Disassembles_as [Use_operands
+        [| VecArray (2, All_elements_of_dreg); CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
+      "vld2_dup", bits_1, pf_su_8_32;
+    Vldx_dup 2,
+      [Instruction_name ["vld1"]; Disassembles_as [Use_operands
+        [| VecArray (2, Dreg); CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
+      "vld2_dup", bits_1, [S64; U64];
+      
+    (* VST2 variants.  *)
+    Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+                                              PtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
+      store_1, pf_su_8_32;
+    Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+                                              PtrTo Corereg |]];
+             Instruction_name ["vst1"]],
+      Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
+      store_1, [S64; U64];
+    Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+					      PtrTo Corereg |];
+                              Use_operands [| VecArray (2, Dreg);
+				              PtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; VecArray (2, Qreg) |], "vst2Q",
+      store_1, pf_su_8_32;
+    
+    Vstx_lane 2,
+      [Disassembles_as [Use_operands
+        [| VecArray (2, Element_of_dreg);
+           CstPtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; VecArray (2, Dreg); Immed |], "vst2_lane",
+      store_3, P8 :: P16 :: F32 :: su_8_32;
+    Vstx_lane 2,
+      [Disassembles_as [Use_operands
+        [| VecArray (2, Element_of_dreg);
+           CstPtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; VecArray (2, Qreg); Immed |], "vst2Q_lane",
+      store_3, [P16; F32; U16; U32; S16; S32];
+
+    (* VLD3 variants.  *)
+    Vldx 3, [], Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
+      "vld3", bits_1, pf_su_8_32;
+    Vldx 3, [Instruction_name ["vld1"]],
+      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
+      "vld3", bits_1, [S64; U64];
+    Vldx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg);
+					      CstPtrTo Corereg |];
+                              Use_operands [| VecArray (3, Dreg);
+					      CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg |],
+      "vld3Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
+    
+    Vldx_lane 3,
+      [Disassembles_as [Use_operands
+        [| VecArray (3, Element_of_dreg);
+           CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg;
+                                     VecArray (3, Dreg); Immed |],
+      "vld3_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
+    Vldx_lane 3,
+      [Disassembles_as [Use_operands
+        [| VecArray (3, Element_of_dreg);
+           CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg;
+				     VecArray (3, Qreg); Immed |],
+      "vld3Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];
+    
+    Vldx_dup 3,
+      [Disassembles_as [Use_operands
+        [| VecArray (3, All_elements_of_dreg); CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
+      "vld3_dup", bits_1, pf_su_8_32;
+    Vldx_dup 3,
+      [Instruction_name ["vld1"]; Disassembles_as [Use_operands
+        [| VecArray (3, Dreg); CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
+      "vld3_dup", bits_1, [S64; U64];
+
+    (* VST3 variants.  *) 
+    Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
+                                              PtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
+      store_1, pf_su_8_32;
+    Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
+                                              PtrTo Corereg |]];
+             Instruction_name ["vst1"]],
+      Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
+      store_1, [S64; U64];
+    Vstx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg);
+					      PtrTo Corereg |];
+                              Use_operands [| VecArray (3, Dreg);
+					      PtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; VecArray (3, Qreg) |], "vst3Q",
+      store_1, pf_su_8_32;
+    
+    Vstx_lane 3,
+      [Disassembles_as [Use_operands
+        [| VecArray (3, Element_of_dreg);
+           CstPtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; VecArray (3, Dreg); Immed |], "vst3_lane",
+      store_3, P8 :: P16 :: F32 :: su_8_32;
+    Vstx_lane 3,
+      [Disassembles_as [Use_operands
+        [| VecArray (3, Element_of_dreg);
+           CstPtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; VecArray (3, Qreg); Immed |], "vst3Q_lane",
+      store_3, [P16; F32; U16; U32; S16; S32];
+    
+    (* VLD4/VST4 variants.  *)
+    Vldx 4, [], Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
+      "vld4", bits_1, pf_su_8_32;
+    Vldx 4, [Instruction_name ["vld1"]],
+      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
+      "vld4", bits_1, [S64; U64];
+    Vldx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
+					      CstPtrTo Corereg |];
+                              Use_operands [| VecArray (4, Dreg);
+					      CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg |],
+      "vld4Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
+    
+    Vldx_lane 4,
+      [Disassembles_as [Use_operands
+        [| VecArray (4, Element_of_dreg);
+           CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg;
+                                     VecArray (4, Dreg); Immed |],
+      "vld4_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
+    Vldx_lane 4,
+      [Disassembles_as [Use_operands
+        [| VecArray (4, Element_of_dreg);
+           CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg;
+   	              VecArray (4, Qreg); Immed |],
+      "vld4Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];
+    
+    Vldx_dup 4,
+      [Disassembles_as [Use_operands
+        [| VecArray (4, All_elements_of_dreg); CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
+      "vld4_dup", bits_1, pf_su_8_32;
+    Vldx_dup 4,
+      [Instruction_name ["vld1"]; Disassembles_as [Use_operands
+        [| VecArray (4, Dreg); CstPtrTo Corereg |]]],
+      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
+      "vld4_dup", bits_1, [S64; U64];
+      
+    Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
+                                              PtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
+      store_1, pf_su_8_32;
+    Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
+                                              PtrTo Corereg |]];
+             Instruction_name ["vst1"]],
+      Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
+      store_1, [S64; U64];
+    Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
+					      PtrTo Corereg |];
+                              Use_operands [| VecArray (4, Dreg);
+					      PtrTo Corereg |]]],
+     Use_operands [| PtrTo Corereg; VecArray (4, Qreg) |], "vst4Q",
+      store_1, pf_su_8_32;
+    
+    Vstx_lane 4,
+      [Disassembles_as [Use_operands
+        [| VecArray (4, Element_of_dreg);
+           CstPtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; VecArray (4, Dreg); Immed |], "vst4_lane",
+      store_3, P8 :: P16 :: F32 :: su_8_32;
+    Vstx_lane 4,
+      [Disassembles_as [Use_operands
+        [| VecArray (4, Element_of_dreg);
+           CstPtrTo Corereg |]]],
+      Use_operands [| PtrTo Corereg; VecArray (4, Qreg); Immed |], "vst4Q_lane",
+      store_3, [P16; F32; U16; U32; S16; S32];
+
+    (* Logical operations. And.  *)
+    Vand, [], All (3, Dreg), "vand", notype_2, su_8_64;
+    Vand, [], All (3, Qreg), "vandQ", notype_2, su_8_64;
+    
+    (* Or.  *)
+    Vorr, [], All (3, Dreg), "vorr", notype_2, su_8_64;
+    Vorr, [], All (3, Qreg), "vorrQ", notype_2, su_8_64;
+    
+    (* Eor.  *)
+    Veor, [], All (3, Dreg), "veor", notype_2, su_8_64;
+    Veor, [], All (3, Qreg), "veorQ", notype_2, su_8_64;
+
+    (* Bic (And-not).  *)
+    Vbic, [], All (3, Dreg), "vbic", notype_2, su_8_64;
+    Vbic, [], All (3, Qreg), "vbicQ", notype_2, su_8_64;
+    
+    (* Or-not.  *)
+    Vorn, [], All (3, Dreg), "vorn", notype_2, su_8_64;
+    Vorn, [], All (3, Qreg), "vornQ", notype_2, su_8_64;
+  ]
+
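To read one row of this table, take the first vadd entry: operation, feature list, shape, intrinsic name prefix, type-generating function, element types. Under the generator's conventions (an assumption here; the driver lives elsewhere in this patch), that single row is what expands into the vadd_s8 ... vadd_u64, vadd_f32 family over D registers. An annotated reading, not generator output:

  (* Vadd, [], All (3, Dreg), "vadd", sign_invar_2, F32 :: su_8_64
       Vadd              -- the operation
       []                -- no extra features
       All (3, Dreg)     -- three D-register operands: result and two sources
       "vadd"            -- public name prefix for the generated intrinsics
       sign_invar_2      -- binary op whose builtin ignores signedness
       F32 :: su_8_64    -- element types to instantiate  *)
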
+let reinterp =
+  let elems = P8 :: P16 :: F32 :: su_8_64 in
+  List.fold_right
+    (fun convto acc ->
+      let types = List.fold_right
+        (fun convfrom acc ->
+          if convfrom <> convto then
+            Cast (convto, convfrom) :: acc
+          else
+            acc)
+        elems
+        []
+      in
+        let dconv = Vreinterp, [No_op], Use_operands [| Dreg; Dreg |],
+                      "vreinterpret", conv_1, types
+        and qconv = Vreinterp, [No_op], Use_operands [| Qreg; Qreg |],
+		      "vreinterpretQ", conv_1, types in
+        dconv :: qconv :: acc)
+    elems
+    []
+
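For one step of the outer fold, say convto = P8, the inner fold collects Cast (P8, x) for every other element type, and the two rows it emits cover the D- and Q-register reinterpret intrinsics (vreinterpret_p8_s8, vreinterpretq_p8_s8 and so on, going by the string_of_elt naming below). A sketch of that first step, assuming the definitions above:

  (* convto = P8 gives
       types = [Cast (P8, P16); Cast (P8, F32); Cast (P8, S64); Cast (P8, U64);
                Cast (P8, S8); Cast (P8, S16); Cast (P8, S32);
                Cast (P8, U8); Cast (P8, U16); Cast (P8, U32)]
     paired with Use_operands [| Dreg; Dreg |] ("vreinterpret") and
     Use_operands [| Qreg; Qreg |] ("vreinterpretQ").  *)
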
+(* Output routines.  *)
+
+let rec string_of_elt = function
+    S8 -> "s8" | S16 -> "s16" | S32 -> "s32" | S64 -> "s64"
+  | U8 -> "u8" | U16 -> "u16" | U32 -> "u32" | U64 -> "u64"
+  | I8 -> "i8" | I16 -> "i16" | I32 -> "i32" | I64 -> "i64"
+  | B8 -> "8" | B16 -> "16" | B32 -> "32" | B64 -> "64"
+  | F32 -> "f32" | P8 -> "p8" | P16 -> "p16"
+  | Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "_" ^ string_of_elt b
+  | NoElts -> failwith "No elts"
+
+let string_of_elt_dots elt =
+  match elt with
+    Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "." ^ string_of_elt b
+  | _ -> string_of_elt elt
+
+let string_of_vectype vt =
+  let rec name affix = function
+    T_int8x8 -> affix "int8x8"
+  | T_int8x16 -> affix "int8x16"
+  | T_int16x4 -> affix "int16x4"
+  | T_int16x8 -> affix "int16x8"
+  | T_int32x2 -> affix "int32x2"
+  | T_int32x4 -> affix "int32x4"
+  | T_int64x1 -> affix "int64x1"
+  | T_int64x2 -> affix "int64x2"
+  | T_uint8x8 -> affix "uint8x8"
+  | T_uint8x16 -> affix "uint8x16"
+  | T_uint16x4 -> affix "uint16x4"
+  | T_uint16x8 -> affix "uint16x8"
+  | T_uint32x2 -> affix "uint32x2"
+  | T_uint32x4 -> affix "uint32x4"
+  | T_uint64x1 -> affix "uint64x1"
+  | T_uint64x2 -> affix "uint64x2"
+  | T_float32x2 -> affix "float32x2"
+  | T_float32x4 -> affix "float32x4"
+  | T_poly8x8 -> affix "poly8x8"
+  | T_poly8x16 -> affix "poly8x16"
+  | T_poly16x4 -> affix "poly16x4"
+  | T_poly16x8 -> affix "poly16x8"
+  | T_int8 -> affix "int8"
+  | T_int16 -> affix "int16"
+  | T_int32 -> affix "int32"
+  | T_int64 -> affix "int64"
+  | T_uint8 -> affix "uint8"
+  | T_uint16 -> affix "uint16"
+  | T_uint32 -> affix "uint32"
+  | T_uint64 -> affix "uint64"
+  | T_poly8 -> affix "poly8"
+  | T_poly16 -> affix "poly16"
+  | T_float32 -> affix "float32"
+  | T_immediate _ -> "const int"
+  | T_void -> "void"
+  | T_intQI -> "__builtin_neon_qi"
+  | T_intHI -> "__builtin_neon_hi"
+  | T_intSI -> "__builtin_neon_si"
+  | T_intDI -> "__builtin_neon_di"
+  | T_arrayof (num, base) ->
+      let basename = name (fun x -> x) base in
+      affix (Printf.sprintf "%sx%d" basename num)
+  | T_ptrto x ->
+      let basename = name affix x in
+      Printf.sprintf "%s *" basename
+  | T_const x ->
+      let basename = name affix x in
+      Printf.sprintf "const %s" basename
+  in
+    name (fun x -> x ^ "_t") vt
+
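A few illustrative evaluations, assuming the vectype constructors above; the results match the *_t spellings used in the generated header:

  let () =
    assert (string_of_vectype T_int8x8 = "int8x8_t");
    assert (string_of_vectype (T_arrayof (2, T_int8x8)) = "int8x8x2_t");
    assert (string_of_vectype (T_ptrto (T_const T_int8)) = "const int8_t *")
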
+let string_of_inttype = function
+    B_TImode -> "__builtin_neon_ti"
+  | B_EImode -> "__builtin_neon_ei"
+  | B_OImode -> "__builtin_neon_oi"
+  | B_CImode -> "__builtin_neon_ci"
+  | B_XImode -> "__builtin_neon_xi"
+
+let string_of_mode = function
+    V8QI -> "v8qi" | V4HI  -> "v4hi"  | V2SI -> "v2si" | V2SF -> "v2sf"
+  | DI   -> "di"   | V16QI -> "v16qi" | V8HI -> "v8hi" | V4SI -> "v4si"
+  | V4SF -> "v4sf" | V2DI  -> "v2di"  | QI -> "qi" | HI -> "hi" | SI -> "si"
+  | SF -> "sf"
+
+(* Use uppercase chars for letters which form part of the intrinsic name, but
+   should be omitted from the builtin name (the info is passed in an extra
+   argument, instead).  *)
+let intrinsic_name name = String.lowercase name
+
+(* Allow the name of the builtin to be overridden by things (e.g. Flipped)
+   found in the features list.  *)
+let builtin_name features name =
+  let name = List.fold_right
+               (fun el name ->
+                 match el with
+                   Flipped x | Builtin_name x -> x
+                 | _ -> name)
+               features name in
+  let islower x = let str = String.make 1 x in (String.lowercase str) = str
+  and buf = Buffer.create (String.length name) in
+  String.iter (fun c -> if islower c then Buffer.add_char buf c) name;
+  Buffer.contents buf
+
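Illustrative cases, assuming the feature constructors above: uppercase letters only mark the intrinsic spelling and are dropped from the builtin name, while Flipped or Builtin_name replaces the starting name outright.

  let () =
    assert (builtin_name [] "vaddQ" = "vadd");
    assert (builtin_name [] "vRhadd" = "vhadd");
    assert (builtin_name [Flipped "vcge"] "vcle" = "vcge")
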
+(* Transform an arity into a list of strings.  *)
+let strings_of_arity a =
+  match a with
+  | Arity0 vt -> [string_of_vectype vt]
+  | Arity1 (vt1, vt2) -> [string_of_vectype vt1; string_of_vectype vt2]
+  | Arity2 (vt1, vt2, vt3) -> [string_of_vectype vt1;
+			       string_of_vectype vt2;
+                               string_of_vectype vt3]
+  | Arity3 (vt1, vt2, vt3, vt4) -> [string_of_vectype vt1;
+                                    string_of_vectype vt2;
+                                    string_of_vectype vt3;
+                                    string_of_vectype vt4]
+  | Arity4 (vt1, vt2, vt3, vt4, vt5) -> [string_of_vectype vt1;
+                                         string_of_vectype vt2;
+                                         string_of_vectype vt3;
+                                         string_of_vectype vt4;
+                                         string_of_vectype vt5]
+
+(* Suffixes on the end of builtin names that are to be stripped in order
+   to obtain the name used as an instruction.  They are only stripped if
+   preceded immediately by an underscore.  *)
+let suffixes_to_strip = [ "n"; "lane"; "dup" ]
+
+(* Get the possible names of an instruction corresponding to a "name" from the
+   ops table.  This is done by taking the equivalent builtin name and
+   stripping any suffixes from the suffixes_to_strip list above, unless the
+   features list contains an Instruction_name entry, in which case that is
+   used, or a Flipped entry, in which case that name is used.  If both such
+   entries are present, the first in the list is chosen.  *)
+let get_insn_names features name =
+  let names = try
+  begin
+    match List.find (fun feature -> match feature with
+                                      Instruction_name _ -> true
+				    | Flipped _ -> true
+				    | _ -> false) features
+    with
+      Instruction_name names -> names
+    | Flipped name -> [name]
+    | _ -> assert false
+  end
+  with Not_found -> [builtin_name features name]
+  in
+  begin
+    List.map (fun name' ->
+      try
+        let underscore = String.rindex name' '_' in
+        let our_suffix = String.sub name' (underscore + 1)
+                                    ((String.length name') - underscore - 1)
+        in
+          let rec strip remaining_suffixes =
+            match remaining_suffixes with
+              [] -> name'
+            | s::ss when our_suffix = s -> String.sub name' 0 underscore
+            | _::ss -> strip ss
+          in
+            strip suffixes_to_strip
+      with (Not_found | Invalid_argument _) -> name') names
+  end
+
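Putting builtin_name and the suffix stripping together, a rough sketch of what this returns for two of the table entries above (assuming those definitions are in scope):

  let () =
    (* no special features: builtin name "vshr_n" with "_n" stripped *)
    assert (get_insn_names [] "vshr_n" = ["vshr"]);
    (* an Instruction_name entry short-circuits the derivation *)
    assert (get_insn_names [Instruction_name ["vrshr"]; Rounding] "vRshr_n"
              = ["vrshr"])
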
+(* Apply a function to each element of a list and then comma-separate
+   the resulting strings.  *)
+let rec commas f elts acc =
+  match elts with
+    [] -> acc
+  | [elt] -> acc ^ (f elt)
+  | elt::elts ->
+    commas f elts (acc ^ (f elt) ^ ", ")
+
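For example, a trivial check of how the accumulator is threaded:

  let () = assert (commas (fun s -> s) ["a"; "b"; "c"] "" = "a, b, c")
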
+(* Given a list of features and the shape specified in the "ops" table, apply
+   a function to each possible shape that the instruction may have.
+   By default, this is the "shape" entry in "ops".  If the features list
+   contains a Disassembles_as entry, the shapes contained in that entry are
+   mapped to corresponding outputs and returned in a list.  If there is more
+   than one Disassembles_as entry, only the first is used.  *)
+let analyze_all_shapes features shape f =
+  try
+    match List.find (fun feature ->
+                       match feature with Disassembles_as _ -> true
+                                        | _ -> false)
+                    features with
+      Disassembles_as shapes -> List.map f shapes
+    | _ -> assert false
+  with Not_found -> [f shape]
+

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/pr-support.c
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/pr-support.c?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/pr-support.c (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/pr-support.c Wed Jul 22 15:36:27 2009
@@ -282,13 +282,25 @@
 	    }
 	  if (op == 0xc8)
 	    {
-	      /* Pop FPA registers.  */
-	      op = next_unwind_byte (uws);
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#ifndef __VFP_FP__
+ 	      /* Pop FPA registers.  */
+ 	      op = next_unwind_byte (uws);
 	      op = ((op & 0xf0) << 12) | ((op & 0xf) + 1);
-	      if (_Unwind_VRS_Pop (context, _UVRSC_FPA, op, _UVRSD_FPAX)
-		  != _UVRSR_OK)
-		return _URC_FAILURE;
-	      continue;
+ 	      if (_Unwind_VRS_Pop (context, _UVRSC_FPA, op, _UVRSD_FPAX)
+ 		  != _UVRSR_OK)
+ 		return _URC_FAILURE;
+ 	      continue;
+#else
+              /* Pop VFPv3 registers D[16+ssss]-D[16+ssss+cccc] with vldm.  */
+              op = next_unwind_byte (uws);
+              op = (((op & 0xf0) + 16) << 12) | ((op & 0xf) + 1);
+              if (_Unwind_VRS_Pop (context, _UVRSC_VFP, op, _UVRSD_DOUBLE)
+                  != _UVRSR_OK)
+                return _URC_FAILURE;
+              continue;
+#endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
 	    }
 	  if (op == 0xc9)
 	    {
@@ -379,3 +391,19 @@
   return ptr;
 }
 
+/* APPLE LOCAL begin v7 support. Merge from Codesourcery */
+
+/* These two should never be used.  */
+
+_Unwind_Ptr
+_Unwind_GetDataRelBase (_Unwind_Context *context __attribute__ ((unused)))
+{
+  abort ();
+}
+
+_Unwind_Ptr
+_Unwind_GetTextRelBase (_Unwind_Context *context __attribute__ ((unused)))
+{
+  abort ();
+}
+/* APPLE LOCAL end v7 support. Merge from Codesourcery */

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/predicates.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/predicates.md?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/predicates.md (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/predicates.md Wed Jul 22 15:36:27 2009
@@ -39,6 +39,18 @@
   return REGNO (op) < FIRST_PSEUDO_REGISTER;
 })
 
+;; APPLE LOCAL begin v7 support. Merge from mainline
+;; A low register.
+(define_predicate "low_register_operand"
+  (and (match_code "reg")
+       (match_test "REGNO (op) <= LAST_LO_REGNUM")))
+
+;; A low register or const_int.
+(define_predicate "low_reg_or_int_operand"
+  (ior (match_code "const_int")
+       (match_operand 0 "low_register_operand")))
+;; APPLE LOCAL end v7 support. Merge from mainline
+
 ;; Any core register, or any pseudo.  */ 
 (define_predicate "arm_general_register_operand"
   (match_code "reg,subreg")
@@ -158,6 +170,16 @@
 			 || (GET_CODE (op) == REG
 			     && REGNO (op) >= FIRST_PSEUDO_REGISTER)))")))
 
+;; APPLE LOCAL begin 6160917
+;; Allow any mem reference through here.  By doing this, instead of just
+;; ignoring unhandled cases in SECONDARY_*_RELOAD_CLASS macros we will
+;; get an assertion failure in neon_reload_{in,out}.
+;; We don't use memory_operand because it fails for out-of-range
+;; indexed addressing.
+(define_predicate "neon_reload_mem_operand"
+  (match_code "mem"))
+;; APPLE LOCAL end 6160917
+
 ;; True for valid operands for the rhs of an floating point insns.
 ;;   Allows regs or certain consts on FPA, just regs for everything else.
 (define_predicate "arm_float_rhs_operand"
@@ -215,6 +237,12 @@
 	    (match_code "ashift,ashiftrt,lshiftrt,rotatert"))
        (match_test "mode == GET_MODE (op)")))
 
+;; APPLE LOCAL begin v7 support. Merge from mainline
+;; True for operators that have 16-bit thumb variants.  */
+(define_special_predicate "thumb_16bit_operator"
+  (match_code "plus,minus,and,ior,xor"))
+;; APPLE LOCAL end v7 support. Merge from mainline
+
 ;; True for EQ & NE
 (define_special_predicate "equality_operator"
   (match_code "eq,ne"))
@@ -440,13 +468,15 @@
 ;; Thumb predicates
 ;;
 
-(define_predicate "thumb_cmp_operand"
+;; APPLE LOCAL v7 support. Merge from mainline
+(define_predicate "thumb1_cmp_operand"
   (ior (and (match_code "reg,subreg")
 	    (match_operand 0 "s_register_operand"))
        (and (match_code "const_int")
 	    (match_test "((unsigned HOST_WIDE_INT) INTVAL (op)) < 256"))))
 
-(define_predicate "thumb_cmpneg_operand"
+;; APPLE LOCAL v7 support. Merge from mainline
+(define_predicate "thumb1_cmpneg_operand"
   (and (match_code "const_int")
        (match_test "INTVAL (op) < 0 && INTVAL (op) > -256")))
 
@@ -496,6 +526,51 @@
   (and (match_code "const_int")
        (match_test "((unsigned HOST_WIDE_INT) INTVAL (op)) < 64")))
 
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+
+;; Neon predicates
+
+(define_predicate "const_multiple_of_8_operand"
+  (match_code "const_int")
+{
+  unsigned HOST_WIDE_INT val = INTVAL (op);
+  return (val & 7) == 0;
+})
+
+(define_predicate "imm_for_neon_mov_operand"
+  (match_code "const_vector")
+{
+  return neon_immediate_valid_for_move (op, mode, NULL, NULL);
+})
+
+(define_predicate "imm_for_neon_logic_operand"
+  (match_code "const_vector")
+{
+  return neon_immediate_valid_for_logic (op, mode, 0, NULL, NULL);
+})
+
+(define_predicate "imm_for_neon_inv_logic_operand"
+  (match_code "const_vector")
+{
+  return neon_immediate_valid_for_logic (op, mode, 1, NULL, NULL);
+})
+
+(define_predicate "neon_logic_op2"
+  (ior (match_operand 0 "imm_for_neon_logic_operand")
+       (match_operand 0 "s_register_operand")))
+
+(define_predicate "neon_inv_logic_op2"
+  (ior (match_operand 0 "imm_for_neon_inv_logic_operand")
+       (match_operand 0 "s_register_operand")))
+
+;; TODO: We could check lane numbers more precisely based on the mode.
+(define_predicate "neon_lane_number"
+  (and (match_code "const_int")
+       (match_test "INTVAL (op) >= 0 && INTVAL (op) <= 7")))
+
+
+
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
 ;; APPLE LOCAL begin ARM pic support
 ;; Allow local symbols and stub references
 (define_predicate "arm_branch_target"
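
Two of the new Neon predicates accept constants via simple checks: const_multiple_of_8_operand tests (val & 7) == 0, and neon_lane_number accepts integers 0 through 7. Restated outside of any RTL context as a small sketch (helper names are invented for illustration):

  (* Not part of the patch: the constant checks behind two new predicates.  *)
  let is_multiple_of_8 v = v land 7 = 0            (* const_multiple_of_8_operand *)
  let is_neon_lane_number v = v >= 0 && v <= 7     (* neon_lane_number *)

  let () =
    assert (is_multiple_of_8 16);
    assert (not (is_multiple_of_8 12));
    assert (is_neon_lane_number 7);
    assert (not (is_neon_lane_number 8))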

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/t-arm
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/t-arm?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/t-arm (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/t-arm Wed Jul 22 15:36:27 2009
@@ -1,5 +1,6 @@
 # Rules common to all arm targets
 
+# APPLE LOCAL begin v7 support. Merge from Codesourcery
 MD_INCLUDES= 	$(srcdir)/config/arm/arm-tune.md \
 		$(srcdir)/config/arm/predicates.md \
 		$(srcdir)/config/arm/arm-generic.md \
@@ -9,8 +10,13 @@
 		$(srcdir)/config/arm/arm926ejs.md \
 		$(srcdir)/config/arm/cirrus.md \
 		$(srcdir)/config/arm/fpa.md \
+		$(srcdir)/config/arm/vec-common.md \
 		$(srcdir)/config/arm/iwmmxt.md \
-		$(srcdir)/config/arm/vfp.md
+		$(srcdir)/config/arm/vfp.md \
+		$(srcdir)/config/arm/neon.md \
+		$(srcdir)/config/arm/thumb2.md \
+		$(srcdir)/config/arm/hwdiv.md
+# APPLE LOCAL end v7 support. Merge from Codesourcery
 
 s-config s-conditions s-flags s-codes s-constants s-emit s-recog s-preds \
 	s-opinit s-extract s-peep s-attr s-attrtab s-output: $(MD_INCLUDES)

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/t-arm-elf
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/t-arm-elf?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/t-arm-elf (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/t-arm-elf Wed Jul 22 15:36:27 2009
@@ -11,6 +11,18 @@
 MULTILIB_EXCEPTIONS  = 
 MULTILIB_MATCHES     =
 
+# APPLE LOCAL begin v7 support. Merge from mainline
+#MULTILIB_OPTIONS      += march=armv7
+#MULTILIB_DIRNAMES     += thumb2
+#MULTILIB_EXCEPTIONS   += march=armv7* marm/*march=armv7*
+#MULTILIB_MATCHES      += march?armv7=march?armv7-a
+#MULTILIB_MATCHES      += march?armv7=march?armv7-r
+#MULTILIB_MATCHES      += march?armv7=march?armv7-m
+#MULTILIB_MATCHES      += march?armv7=mcpu?cortex-a8
+#MULTILIB_MATCHES      += march?armv7=mcpu?cortex-r4
+#MULTILIB_MATCHES      += march?armv7=mcpu?cortex-m3
+# APPLE LOCAL end v7 support. Merge from mainline
+
 # MULTILIB_OPTIONS    += mcpu=ep9312
 # MULTILIB_DIRNAMES   += ep9312
 # MULTILIB_EXCEPTIONS += *mthumb/*mcpu=ep9312*

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/t-darwin
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/t-darwin?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/t-darwin (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/t-darwin Wed Jul 22 15:36:27 2009
@@ -24,9 +24,19 @@
 	$(srcdir)/config/arm/_fixunssfdi.c 
 # APPLE LOCAL end 5316398 improved float/double -> int64 functions
 
-MULTILIB_OPTIONS     = march=armv6k
-MULTILIB_DIRNAMES    = v6
-MULTILIB_EXCEPTIONS  = 
+# APPLE LOCAL begin 6611402 configurable multilib architectures
+ifndef ARM_MULTILIB_ARCHS
+ARM_MULTILIB_ARCHS:=armv5 armv6 armv7
+endif
+
+MULTILIB_OPTIONS:=$(shell echo $(ARM_MULTILIB_ARCHS) | \
+  sed -e s/armv5/march=armv5tej/ \
+      -e s/armv6/march=armv6k/ \
+      -e s/armv7/march=armv7a/)
+MULTILIB_DIRNAMES:=$(shell echo $(ARM_MULTILIB_ARCHS) | sed -e s/arm//g)
+MULTILIB_EXCEPTIONS:=$(shell $(srcdir)/config/arm/gen-darwin-multilib-exceptions.sh $(ARM_MULTILIB_ARCHS))
+# APPLE LOCAL end 6611402 configurable multilib architectures
+
 MULTILIB_MATCHES     =
 TARGET_LIBGCC2_CFLAGS = -fno-inline
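
The sed substitutions above turn each entry of ARM_MULTILIB_ARCHS into a -march= multilib option, and the dirnames are the arch names with the leading "arm" stripped; MULTILIB_EXCEPTIONS comes from the gen-darwin-multilib-exceptions.sh script and is not modelled here. A small sketch of the mapping for the default list armv5 armv6 armv7 (illustrative only, not how make evaluates the fragment):

  (* Not part of the patch: the arch -> (option, dirname) mapping that the
     sed commands in t-darwin implement for the default architectures.  *)
  let multilib_of_arch = function
    | "armv5" -> ("march=armv5tej", "v5")
    | "armv6" -> ("march=armv6k", "v6")
    | "armv7" -> ("march=armv7a", "v7")
    | arch -> invalid_arg arch    (* sketch covers only the default list *)

  let () =
    List.iter (fun arch ->
        let opt, dir = multilib_of_arch arch in
        Printf.printf "%-6s -> MULTILIB_OPTIONS: %-14s MULTILIB_DIRNAMES: %s\n"
          arch opt dir)
      ["armv5"; "armv6"; "armv7"]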
 

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/t-pe
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/t-pe?rev=76781&r1=76780&r2=76781&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/t-pe (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/t-pe Wed Jul 22 15:36:27 2009
@@ -29,4 +29,5 @@
 
 LIBGCC = stmp-multilib
 INSTALL_LIBGCC = install-multilib
-TARGET_LIBGCC2_CFLAGS = 
\ No newline at end of file
+# APPLE LOCAL v7 support
+TARGET_LIBGCC2_CFLAGS =

More information about the llvm-commits mailing list