r201848 - ARM & AArch64: use table for EmitCommonNeonBuiltinExpr

Tim Northover tnorthover at apple.com
Fri Feb 21 03:57:25 PST 2014


Author: tnorthover
Date: Fri Feb 21 05:57:24 2014
New Revision: 201848

URL: http://llvm.org/viewvc/llvm-project?rev=201848&view=rev
Log:
ARM & AArch64: use table for EmitCommonNeonBuiltinExpr

This extends the intrinsic lookup table format slightly and adds
entries for use by the shared ARM/AArch64 definitions. The benefit is
currently smaller than for the SISD intrinsics (there is more custom
code implementing this set), but a few lines are saved and there is
scope for future expansion.
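
For reference, the lookup pattern the tables rely on is roughly the
following (a minimal, self-contained sketch, not the actual
CGBuiltin.cpp code; the builtin and intrinsic IDs are made-up
placeholders):

  #include <algorithm>
  #include <cassert>
  #include <cstdio>

  struct IntrinsicInfo {
    unsigned BuiltinID;      // Clang builtin being lowered.
    unsigned LLVMIntrinsic;  // LLVM intrinsic to emit (0 => custom code path).
    const char *NameHint;    // Name given to the emitted call instruction.
    unsigned TypeModifier;   // Bitmask describing the overload types.

    bool operator<(unsigned RHSBuiltinID) const {
      return BuiltinID < RHSBuiltinID;
    }
  };

  // The table must stay sorted by BuiltinID so a binary search is valid.
  static const IntrinsicInfo Map[] = {
    { 10, 100, "vabd", 0 },
    { 11, 101, "vabs", 0 },
    { 12, 102, "vadd", 0 },
  };
  static const unsigned MapSize = sizeof(Map) / sizeof(Map[0]);

  static const IntrinsicInfo *findInMap(unsigned BuiltinID) {
  #ifndef NDEBUG
    // Cheap one-time sanity check that the table is actually sorted.
    for (unsigned i = 0; i + 1 < MapSize; ++i)
      assert(Map[i].BuiltinID <= Map[i + 1].BuiltinID && "map not sorted");
  #endif
    const IntrinsicInfo *Builtin =
        std::lower_bound(Map, Map + MapSize, BuiltinID);
    if (Builtin != Map + MapSize && Builtin->BuiltinID == BuiltinID)
      return Builtin;
    return 0; // Not in the table: fall back to the per-builtin switch.
  }

  int main() {
    if (const IntrinsicInfo *Info = findInMap(11))
      std::printf("builtin 11 -> intrinsic %u (%s)\n",
                  Info->LLVMIntrinsic, Info->NameHint);
    return 0;
  }

In the real patch the same std::lower_bound search serves both the
AArch64 SISD table and the new shared table, and the TypeModifier bits
(AddRetType, Add1ArgType, VectorizeRetType, ...) tell
LookupNeonLLVMIntrinsic which overloaded types to pass to
CGM.getIntrinsic.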

Modified:
    cfe/trunk/lib/CodeGen/CGBuiltin.cpp
    cfe/trunk/lib/CodeGen/CodeGenFunction.h

Modified: cfe/trunk/lib/CodeGen/CGBuiltin.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGBuiltin.cpp?rev=201848&r1=201847&r2=201848&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGBuiltin.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp Fri Feb 21 05:57:24 2014
@@ -1754,18 +1754,858 @@ CodeGenFunction::EmitPointerWithAlignmen
   return std::make_pair(EmitScalarExpr(Addr), Align);
 }
 
-Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(unsigned BuiltinID,
-                                                  const CallExpr *E,
-                                                  SmallVectorImpl<Value *> &Ops,
-                                                  llvm::Value *Align) {
+enum {
+  AddRetType = (1 << 0),
+  Add1ArgType = (1 << 1),
+  Add2ArgTypes = (1 << 2),
+
+  VectorizeRetType = (1 << 3),
+  VectorizeArgTypes = (1 << 4),
+
+  InventFloatType = (1 << 5),
+  UnsignedAlts = (1 << 6),
+
+  Vectorize1ArgType = Add1ArgType | VectorizeArgTypes,
+  VectorRet = AddRetType | VectorizeRetType,
+  VectorRetGetArgs01 =
+      AddRetType | Add2ArgTypes | VectorizeRetType | VectorizeArgTypes,
+  FpCmpzModifiers =
+      AddRetType | VectorizeRetType | Add1ArgType | InventFloatType,
+};
+
+struct NeonIntrinsicInfo {
+  unsigned BuiltinID;
+  unsigned LLVMIntrinsic;
+  unsigned AltLLVMIntrinsic;
+  const char *NameHint;
+  unsigned TypeModifier;
+
+  bool operator<(unsigned RHSBuiltinID) const {
+    return BuiltinID < RHSBuiltinID;
+  }
+};
+
+#define NEONMAP0(NameBase) \
+  { NEON::BI__builtin_neon_ ## NameBase, 0, 0, #NameBase, 0 }
+
+#define NEONMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
+  { NEON:: BI__builtin_neon_ ## NameBase, \
+      Intrinsic::LLVMIntrinsic, 0, #NameBase, TypeModifier }
+
+#define NEONMAP2(NameBase, LLVMIntrinsic, AltLLVMIntrinsic, TypeModifier) \
+  { NEON:: BI__builtin_neon_ ## NameBase, \
+      Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, \
+      #NameBase, TypeModifier }
+
+static const NeonIntrinsicInfo AArch64SISDIntrinsicInfo[] = {
+  NEONMAP1(vabdd_f64, aarch64_neon_vabd, AddRetType),
+  NEONMAP1(vabds_f32, aarch64_neon_vabd, AddRetType),
+  NEONMAP1(vabsd_s64, aarch64_neon_vabs, 0),
+  NEONMAP1(vaddd_s64, aarch64_neon_vaddds, 0),
+  NEONMAP1(vaddd_u64, aarch64_neon_vadddu, 0),
+  NEONMAP1(vaddlv_s16, aarch64_neon_saddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlv_s8, aarch64_neon_saddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlv_u16, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlv_u8, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlvq_s16, aarch64_neon_saddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlvq_s8, aarch64_neon_saddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlvq_u16, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlvq_u8, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddv_f32, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
+  NEONMAP1(vaddv_s16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddv_s32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddv_s8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddv_u16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddv_u32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddv_u8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_f32, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
+  NEONMAP1(vaddvq_f64, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
+  NEONMAP1(vaddvq_s16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_s32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_s64, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_s8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_u16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_u32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_u64, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_u8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vcaged_f64, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcages_f32, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcagtd_f64, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcagts_f32, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcaled_f64, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcales_f32, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcaltd_f64, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcalts_f32, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vceqd_f64, aarch64_neon_fceq, VectorRet | Add2ArgTypes),
+  NEONMAP1(vceqd_s64, aarch64_neon_vceq, VectorRetGetArgs01),
+  NEONMAP1(vceqd_u64, aarch64_neon_vceq, VectorRetGetArgs01),
+  NEONMAP1(vceqs_f32, aarch64_neon_fceq, VectorRet | Add2ArgTypes),
+  NEONMAP1(vceqzd_f64, aarch64_neon_fceq, FpCmpzModifiers),
+  NEONMAP1(vceqzd_s64, aarch64_neon_vceq, VectorRetGetArgs01),
+  NEONMAP1(vceqzd_u64, aarch64_neon_vceq, VectorRetGetArgs01),
+  NEONMAP1(vceqzs_f32, aarch64_neon_fceq, FpCmpzModifiers),
+  NEONMAP1(vcged_f64, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcged_s64, aarch64_neon_vcge, VectorRetGetArgs01),
+  NEONMAP1(vcged_u64, aarch64_neon_vchs, VectorRetGetArgs01),
+  NEONMAP1(vcges_f32, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcgezd_f64, aarch64_neon_fcge, FpCmpzModifiers),
+  NEONMAP1(vcgezd_s64, aarch64_neon_vcge, VectorRetGetArgs01),
+  NEONMAP1(vcgezs_f32, aarch64_neon_fcge, FpCmpzModifiers),
+  NEONMAP1(vcgtd_f64, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcgtd_s64, aarch64_neon_vcgt, VectorRetGetArgs01),
+  NEONMAP1(vcgtd_u64, aarch64_neon_vchi, VectorRetGetArgs01),
+  NEONMAP1(vcgts_f32, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcgtzd_f64, aarch64_neon_fcgt, FpCmpzModifiers),
+  NEONMAP1(vcgtzd_s64, aarch64_neon_vcgt, VectorRetGetArgs01),
+  NEONMAP1(vcgtzs_f32, aarch64_neon_fcgt, FpCmpzModifiers),
+  NEONMAP1(vcled_f64, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcled_s64, aarch64_neon_vcge, VectorRetGetArgs01),
+  NEONMAP1(vcled_u64, aarch64_neon_vchs, VectorRetGetArgs01),
+  NEONMAP1(vcles_f32, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
+  NEONMAP1(vclezd_f64, aarch64_neon_fclez, FpCmpzModifiers),
+  NEONMAP1(vclezd_s64, aarch64_neon_vclez, VectorRetGetArgs01),
+  NEONMAP1(vclezs_f32, aarch64_neon_fclez, FpCmpzModifiers),
+  NEONMAP1(vcltd_f64, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcltd_s64, aarch64_neon_vcgt, VectorRetGetArgs01),
+  NEONMAP1(vcltd_u64, aarch64_neon_vchi, VectorRetGetArgs01),
+  NEONMAP1(vclts_f32, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcltzd_f64, aarch64_neon_fcltz, FpCmpzModifiers),
+  NEONMAP1(vcltzd_s64, aarch64_neon_vcltz, VectorRetGetArgs01),
+  NEONMAP1(vcltzs_f32, aarch64_neon_fcltz, FpCmpzModifiers),
+  NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtd_f64_s64, aarch64_neon_vcvtint2fps, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvtd_f64_u64, aarch64_neon_vcvtint2fpu, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp_n, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp_n, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs_n, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu_n, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvts_f32_s32, aarch64_neon_vcvtint2fps, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvts_f32_u32, aarch64_neon_vcvtint2fpu, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp_n, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp_n, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs_n, VectorRet | Add1ArgType),
+  NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu_n, VectorRet | Add1ArgType),
+  NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, VectorRet | Add1ArgType),
+  NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtxd_f32_f64, aarch64_neon_fcvtxn, 0),
+  NEONMAP0(vdupb_lane_i8),
+  NEONMAP0(vdupb_laneq_i8),
+  NEONMAP0(vdupd_lane_f64),
+  NEONMAP0(vdupd_lane_i64),
+  NEONMAP0(vdupd_laneq_f64),
+  NEONMAP0(vdupd_laneq_i64),
+  NEONMAP0(vduph_lane_i16),
+  NEONMAP0(vduph_laneq_i16),
+  NEONMAP0(vdups_lane_f32),
+  NEONMAP0(vdups_lane_i32),
+  NEONMAP0(vdups_laneq_f32),
+  NEONMAP0(vdups_laneq_i32),
+  NEONMAP0(vfmad_lane_f64),
+  NEONMAP0(vfmad_laneq_f64),
+  NEONMAP0(vfmas_lane_f32),
+  NEONMAP0(vfmas_laneq_f32),
+  NEONMAP0(vget_lane_f32),
+  NEONMAP0(vget_lane_f64),
+  NEONMAP0(vget_lane_i16),
+  NEONMAP0(vget_lane_i32),
+  NEONMAP0(vget_lane_i64),
+  NEONMAP0(vget_lane_i8),
+  NEONMAP0(vgetq_lane_f32),
+  NEONMAP0(vgetq_lane_f64),
+  NEONMAP0(vgetq_lane_i16),
+  NEONMAP0(vgetq_lane_i32),
+  NEONMAP0(vgetq_lane_i64),
+  NEONMAP0(vgetq_lane_i8),
+  NEONMAP1(vmaxnmv_f32, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxnmvq_f32, aarch64_neon_vmaxnmv, 0),
+  NEONMAP1(vmaxnmvq_f64, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxv_f32, aarch64_neon_vpmax, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxv_s16, aarch64_neon_smaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxv_s8, aarch64_neon_smaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxv_u16, aarch64_neon_umaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxv_u8, aarch64_neon_umaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxvq_f32, aarch64_neon_vmaxv, 0),
+  NEONMAP1(vmaxvq_f64, aarch64_neon_vpmax, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxvq_s16, aarch64_neon_smaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxvq_s8, aarch64_neon_smaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxvq_u16, aarch64_neon_umaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxvq_u8, aarch64_neon_umaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vminnmv_f32, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
+  NEONMAP1(vminnmvq_f32, aarch64_neon_vminnmv, 0),
+  NEONMAP1(vminnmvq_f64, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
+  NEONMAP1(vminv_f32, aarch64_neon_vpmin, AddRetType | Add1ArgType),
+  NEONMAP1(vminv_s16, aarch64_neon_sminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminv_s32, aarch64_neon_sminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminv_s8, aarch64_neon_sminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminv_u16, aarch64_neon_uminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminv_u32, aarch64_neon_uminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminv_u8, aarch64_neon_uminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminvq_f32, aarch64_neon_vminv, 0),
+  NEONMAP1(vminvq_f64, aarch64_neon_vpmin, AddRetType | Add1ArgType),
+  NEONMAP1(vminvq_s16, aarch64_neon_sminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminvq_s32, aarch64_neon_sminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminvq_s8, aarch64_neon_sminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminvq_u16, aarch64_neon_uminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminvq_u32, aarch64_neon_uminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminvq_u8, aarch64_neon_uminv, VectorRet | Add1ArgType),
+  NEONMAP0(vmul_n_f64),
+  NEONMAP1(vmull_p64, aarch64_neon_vmull_p64, 0),
+  NEONMAP0(vmulxd_f64),
+  NEONMAP0(vmulxs_f32),
+  NEONMAP1(vnegd_s64, aarch64_neon_vneg, 0),
+  NEONMAP1(vpaddd_f64, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
+  NEONMAP1(vpaddd_s64, aarch64_neon_vpadd, 0),
+  NEONMAP1(vpaddd_u64, aarch64_neon_vpadd, 0),
+  NEONMAP1(vpadds_f32, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
+  NEONMAP1(vpmaxnmqd_f64, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
+  NEONMAP1(vpmaxnms_f32, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
+  NEONMAP1(vpmaxqd_f64, aarch64_neon_vpmax, AddRetType | Add1ArgType),
+  NEONMAP1(vpmaxs_f32, aarch64_neon_vpmax, AddRetType | Add1ArgType),
+  NEONMAP1(vpminnmqd_f64, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
+  NEONMAP1(vpminnms_f32, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
+  NEONMAP1(vpminqd_f64, aarch64_neon_vpmin, AddRetType | Add1ArgType),
+  NEONMAP1(vpmins_f32, aarch64_neon_vpmin, AddRetType | Add1ArgType),
+  NEONMAP1(vqabsb_s8, arm_neon_vqabs, VectorRet),
+  NEONMAP1(vqabsd_s64, arm_neon_vqabs, VectorRet),
+  NEONMAP1(vqabsh_s16, arm_neon_vqabs, VectorRet),
+  NEONMAP1(vqabss_s32, arm_neon_vqabs, VectorRet),
+  NEONMAP1(vqaddb_s8, arm_neon_vqadds, VectorRet),
+  NEONMAP1(vqaddb_u8, arm_neon_vqaddu, VectorRet),
+  NEONMAP1(vqaddd_s64, arm_neon_vqadds, VectorRet),
+  NEONMAP1(vqaddd_u64, arm_neon_vqaddu, VectorRet),
+  NEONMAP1(vqaddh_s16, arm_neon_vqadds, VectorRet),
+  NEONMAP1(vqaddh_u16, arm_neon_vqaddu, VectorRet),
+  NEONMAP1(vqadds_s32, arm_neon_vqadds, VectorRet),
+  NEONMAP1(vqadds_u32, arm_neon_vqaddu, VectorRet),
+  NEONMAP0(vqdmlalh_lane_s16),
+  NEONMAP0(vqdmlalh_laneq_s16),
+  NEONMAP1(vqdmlalh_s16, aarch64_neon_vqdmlal, VectorRet),
+  NEONMAP0(vqdmlals_lane_s32),
+  NEONMAP0(vqdmlals_laneq_s32),
+  NEONMAP1(vqdmlals_s32, aarch64_neon_vqdmlal, VectorRet),
+  NEONMAP0(vqdmlslh_lane_s16),
+  NEONMAP0(vqdmlslh_laneq_s16),
+  NEONMAP1(vqdmlslh_s16, aarch64_neon_vqdmlsl, VectorRet),
+  NEONMAP0(vqdmlsls_lane_s32),
+  NEONMAP0(vqdmlsls_laneq_s32),
+  NEONMAP1(vqdmlsls_s32, aarch64_neon_vqdmlsl, VectorRet),
+  NEONMAP1(vqdmulhh_s16, arm_neon_vqdmulh, VectorRet),
+  NEONMAP1(vqdmulhs_s32, arm_neon_vqdmulh, VectorRet),
+  NEONMAP1(vqdmullh_s16, arm_neon_vqdmull, VectorRet),
+  NEONMAP1(vqdmulls_s32, arm_neon_vqdmull, VectorRet),
+  NEONMAP1(vqmovnd_s64, arm_neon_vqmovns, VectorRet),
+  NEONMAP1(vqmovnd_u64, arm_neon_vqmovnu, VectorRet),
+  NEONMAP1(vqmovnh_s16, arm_neon_vqmovns, VectorRet),
+  NEONMAP1(vqmovnh_u16, arm_neon_vqmovnu, VectorRet),
+  NEONMAP1(vqmovns_s32, arm_neon_vqmovns, VectorRet),
+  NEONMAP1(vqmovns_u32, arm_neon_vqmovnu, VectorRet),
+  NEONMAP1(vqmovund_s64, arm_neon_vqmovnsu, VectorRet),
+  NEONMAP1(vqmovunh_s16, arm_neon_vqmovnsu, VectorRet),
+  NEONMAP1(vqmovuns_s32, arm_neon_vqmovnsu, VectorRet),
+  NEONMAP1(vqnegb_s8, arm_neon_vqneg, VectorRet),
+  NEONMAP1(vqnegd_s64, arm_neon_vqneg, VectorRet),
+  NEONMAP1(vqnegh_s16, arm_neon_vqneg, VectorRet),
+  NEONMAP1(vqnegs_s32, arm_neon_vqneg, VectorRet),
+  NEONMAP1(vqrdmulhh_s16, arm_neon_vqrdmulh, VectorRet),
+  NEONMAP1(vqrdmulhs_s32, arm_neon_vqrdmulh, VectorRet),
+  NEONMAP1(vqrshlb_s8, aarch64_neon_vqrshls, VectorRet),
+  NEONMAP1(vqrshlb_u8, aarch64_neon_vqrshlu, VectorRet),
+  NEONMAP1(vqrshld_s64, aarch64_neon_vqrshls, VectorRet),
+  NEONMAP1(vqrshld_u64, aarch64_neon_vqrshlu, VectorRet),
+  NEONMAP1(vqrshlh_s16, aarch64_neon_vqrshls, VectorRet),
+  NEONMAP1(vqrshlh_u16, aarch64_neon_vqrshlu, VectorRet),
+  NEONMAP1(vqrshls_s32, aarch64_neon_vqrshls, VectorRet),
+  NEONMAP1(vqrshls_u32, aarch64_neon_vqrshlu, VectorRet),
+  NEONMAP1(vqrshrnd_n_s64, aarch64_neon_vsqrshrn, VectorRet),
+  NEONMAP1(vqrshrnd_n_u64, aarch64_neon_vuqrshrn, VectorRet),
+  NEONMAP1(vqrshrnh_n_s16, aarch64_neon_vsqrshrn, VectorRet),
+  NEONMAP1(vqrshrnh_n_u16, aarch64_neon_vuqrshrn, VectorRet),
+  NEONMAP1(vqrshrns_n_s32, aarch64_neon_vsqrshrn, VectorRet),
+  NEONMAP1(vqrshrns_n_u32, aarch64_neon_vuqrshrn, VectorRet),
+  NEONMAP1(vqrshrund_n_s64, aarch64_neon_vsqrshrun, VectorRet),
+  NEONMAP1(vqrshrunh_n_s16, aarch64_neon_vsqrshrun, VectorRet),
+  NEONMAP1(vqrshruns_n_s32, aarch64_neon_vsqrshrun, VectorRet),
+  NEONMAP1(vqshlb_n_s8, aarch64_neon_vqshls_n, VectorRet),
+  NEONMAP1(vqshlb_n_u8, aarch64_neon_vqshlu_n, VectorRet),
+  NEONMAP1(vqshlb_s8, aarch64_neon_vqshls, VectorRet),
+  NEONMAP1(vqshlb_u8, aarch64_neon_vqshlu, VectorRet),
+  NEONMAP1(vqshld_n_s64, aarch64_neon_vqshls_n, VectorRet),
+  NEONMAP1(vqshld_n_u64, aarch64_neon_vqshlu_n, VectorRet),
+  NEONMAP1(vqshld_s64, aarch64_neon_vqshls, VectorRet),
+  NEONMAP1(vqshld_u64, aarch64_neon_vqshlu, VectorRet),
+  NEONMAP1(vqshlh_n_s16, aarch64_neon_vqshls_n, VectorRet),
+  NEONMAP1(vqshlh_n_u16, aarch64_neon_vqshlu_n, VectorRet),
+  NEONMAP1(vqshlh_s16, aarch64_neon_vqshls, VectorRet),
+  NEONMAP1(vqshlh_u16, aarch64_neon_vqshlu, VectorRet),
+  NEONMAP1(vqshls_n_s32, aarch64_neon_vqshls_n, VectorRet),
+  NEONMAP1(vqshls_n_u32, aarch64_neon_vqshlu_n, VectorRet),
+  NEONMAP1(vqshls_s32, aarch64_neon_vqshls, VectorRet),
+  NEONMAP1(vqshls_u32, aarch64_neon_vqshlu, VectorRet),
+  NEONMAP1(vqshlub_n_s8, aarch64_neon_vsqshlu, VectorRet),
+  NEONMAP1(vqshlud_n_s64, aarch64_neon_vsqshlu, VectorRet),
+  NEONMAP1(vqshluh_n_s16, aarch64_neon_vsqshlu, VectorRet),
+  NEONMAP1(vqshlus_n_s32, aarch64_neon_vsqshlu, VectorRet),
+  NEONMAP1(vqshrnd_n_s64, aarch64_neon_vsqshrn, VectorRet),
+  NEONMAP1(vqshrnd_n_u64, aarch64_neon_vuqshrn, VectorRet),
+  NEONMAP1(vqshrnh_n_s16, aarch64_neon_vsqshrn, VectorRet),
+  NEONMAP1(vqshrnh_n_u16, aarch64_neon_vuqshrn, VectorRet),
+  NEONMAP1(vqshrns_n_s32, aarch64_neon_vsqshrn, VectorRet),
+  NEONMAP1(vqshrns_n_u32, aarch64_neon_vuqshrn, VectorRet),
+  NEONMAP1(vqshrund_n_s64, aarch64_neon_vsqshrun, VectorRet),
+  NEONMAP1(vqshrunh_n_s16, aarch64_neon_vsqshrun, VectorRet),
+  NEONMAP1(vqshruns_n_s32, aarch64_neon_vsqshrun, VectorRet),
+  NEONMAP1(vqsubb_s8, arm_neon_vqsubs, VectorRet),
+  NEONMAP1(vqsubb_u8, arm_neon_vqsubu, VectorRet),
+  NEONMAP1(vqsubd_s64, arm_neon_vqsubs, VectorRet),
+  NEONMAP1(vqsubd_u64, arm_neon_vqsubu, VectorRet),
+  NEONMAP1(vqsubh_s16, arm_neon_vqsubs, VectorRet),
+  NEONMAP1(vqsubh_u16, arm_neon_vqsubu, VectorRet),
+  NEONMAP1(vqsubs_s32, arm_neon_vqsubs, VectorRet),
+  NEONMAP1(vqsubs_u32, arm_neon_vqsubu, VectorRet),
+  NEONMAP1(vrecped_f64, aarch64_neon_vrecpe, AddRetType),
+  NEONMAP1(vrecpes_f32, aarch64_neon_vrecpe, AddRetType),
+  NEONMAP1(vrecpsd_f64, aarch64_neon_vrecps, AddRetType),
+  NEONMAP1(vrecpss_f32, aarch64_neon_vrecps, AddRetType),
+  NEONMAP1(vrecpxd_f64, aarch64_neon_vrecpx, AddRetType),
+  NEONMAP1(vrecpxs_f32, aarch64_neon_vrecpx, AddRetType),
+  NEONMAP1(vrshld_s64, aarch64_neon_vrshlds, 0),
+  NEONMAP1(vrshld_u64, aarch64_neon_vrshldu, 0),
+  NEONMAP1(vrshrd_n_s64, aarch64_neon_vsrshr, VectorRet),
+  NEONMAP1(vrshrd_n_u64, aarch64_neon_vurshr, VectorRet),
+  NEONMAP1(vrsqrted_f64, aarch64_neon_vrsqrte, AddRetType),
+  NEONMAP1(vrsqrtes_f32, aarch64_neon_vrsqrte, AddRetType),
+  NEONMAP1(vrsqrtsd_f64, aarch64_neon_vrsqrts, AddRetType),
+  NEONMAP1(vrsqrtss_f32, aarch64_neon_vrsqrts, AddRetType),
+  NEONMAP1(vrsrad_n_s64, aarch64_neon_vrsrads_n, 0),
+  NEONMAP1(vrsrad_n_u64, aarch64_neon_vrsradu_n, 0),
+  NEONMAP0(vset_lane_f32),
+  NEONMAP0(vset_lane_f64),
+  NEONMAP0(vset_lane_i16),
+  NEONMAP0(vset_lane_i32),
+  NEONMAP0(vset_lane_i64),
+  NEONMAP0(vset_lane_i8),
+  NEONMAP0(vsetq_lane_f32),
+  NEONMAP0(vsetq_lane_f64),
+  NEONMAP0(vsetq_lane_i16),
+  NEONMAP0(vsetq_lane_i32),
+  NEONMAP0(vsetq_lane_i64),
+  NEONMAP0(vsetq_lane_i8),
+  NEONMAP1(vsha1cq_u32, arm_neon_sha1c, 0),
+  NEONMAP1(vsha1h_u32, arm_neon_sha1h, 0),
+  NEONMAP1(vsha1mq_u32, arm_neon_sha1m, 0),
+  NEONMAP1(vsha1pq_u32, arm_neon_sha1p, 0),
+  NEONMAP1(vshld_n_s64, aarch64_neon_vshld_n, 0),
+  NEONMAP1(vshld_n_u64, aarch64_neon_vshld_n, 0),
+  NEONMAP1(vshld_s64, aarch64_neon_vshlds, 0),
+  NEONMAP1(vshld_u64, aarch64_neon_vshldu, 0),
+  NEONMAP1(vshrd_n_s64, aarch64_neon_vshrds_n, 0),
+  NEONMAP1(vshrd_n_u64, aarch64_neon_vshrdu_n, 0),
+  NEONMAP1(vslid_n_s64, aarch64_neon_vsli, VectorRet),
+  NEONMAP1(vslid_n_u64, aarch64_neon_vsli, VectorRet),
+  NEONMAP1(vsqaddb_u8, aarch64_neon_vsqadd, VectorRet),
+  NEONMAP1(vsqaddd_u64, aarch64_neon_vsqadd, VectorRet),
+  NEONMAP1(vsqaddh_u16, aarch64_neon_vsqadd, VectorRet),
+  NEONMAP1(vsqadds_u32, aarch64_neon_vsqadd, VectorRet),
+  NEONMAP1(vsrad_n_s64, aarch64_neon_vsrads_n, 0),
+  NEONMAP1(vsrad_n_u64, aarch64_neon_vsradu_n, 0),
+  NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, VectorRet),
+  NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, VectorRet),
+  NEONMAP1(vsubd_s64, aarch64_neon_vsubds, 0),
+  NEONMAP1(vsubd_u64, aarch64_neon_vsubdu, 0),
+  NEONMAP1(vtstd_s64, aarch64_neon_vtstd, VectorRetGetArgs01),
+  NEONMAP1(vtstd_u64, aarch64_neon_vtstd, VectorRetGetArgs01),
+  NEONMAP1(vuqaddb_s8, aarch64_neon_vuqadd, VectorRet),
+  NEONMAP1(vuqaddd_s64, aarch64_neon_vuqadd, VectorRet),
+  NEONMAP1(vuqaddh_s16, aarch64_neon_vuqadd, VectorRet),
+  NEONMAP1(vuqadds_s32, aarch64_neon_vuqadd, VectorRet)
+};
+
+static const NeonIntrinsicInfo ARMSIMDIntrinsicMap[] = {
+  NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
+  NEONMAP1(vabs_v, arm_neon_vabs, 0),
+  NEONMAP1(vabsq_v, arm_neon_vabs, 0),
+  NEONMAP0(vaddhn_v),
+  NEONMAP1(vaesdq_v, arm_neon_aesd, 0),
+  NEONMAP1(vaeseq_v, arm_neon_aese, 0),
+  NEONMAP1(vaesimcq_v, arm_neon_aesimc, 0),
+  NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0),
+  NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
+  NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
+  NEONMAP1(vcage_v, arm_neon_vacge, 0),
+  NEONMAP1(vcageq_v, arm_neon_vacge, 0),
+  NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
+  NEONMAP1(vcagtq_v, arm_neon_vacgt, 0),
+  NEONMAP1(vcale_v, arm_neon_vacge, 0),
+  NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
+  NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
+  NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
+  NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
+  NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
+  NEONMAP1(vclz_v, ctlz, Add1ArgType),
+  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
+  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
+  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
+  NEONMAP1(vcvt_f16_v, arm_neon_vcvtfp2hf, 0),
+  NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
+  NEONMAP0(vcvt_f32_v),
+  NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
+  NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0),
+  NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0),
+  NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0),
+  NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0),
+  NEONMAP0(vcvt_s32_v),
+  NEONMAP0(vcvt_s64_v),
+  NEONMAP0(vcvt_u32_v),
+  NEONMAP0(vcvt_u64_v),
+  NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
+  NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
+  NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
+  NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
+  NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0),
+  NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0),
+  NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
+  NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
+  NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
+  NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
+  NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0),
+  NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0),
+  NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0),
+  NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0),
+  NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0),
+  NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0),
+  NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0),
+  NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0),
+  NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0),
+  NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0),
+  NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0),
+  NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0),
+  NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0),
+  NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0),
+  NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0),
+  NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0),
+  NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0),
+  NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0),
+  NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0),
+  NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0),
+  NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
+  NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
+  NEONMAP0(vcvtq_f32_v),
+  NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
+  NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0),
+  NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0),
+  NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0),
+  NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0),
+  NEONMAP0(vcvtq_s32_v),
+  NEONMAP0(vcvtq_s64_v),
+  NEONMAP0(vcvtq_u32_v),
+  NEONMAP0(vcvtq_u64_v),
+  NEONMAP0(vext_v),
+  NEONMAP0(vextq_v),
+  NEONMAP0(vfma_v),
+  NEONMAP0(vfmaq_v),
+  NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
+  NEONMAP0(vld1_dup_v),
+  NEONMAP1(vld1_v, arm_neon_vld1, 0),
+  NEONMAP0(vld1q_dup_v),
+  NEONMAP1(vld1q_v, arm_neon_vld1, 0),
+  NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
+  NEONMAP1(vld2_v, arm_neon_vld2, 0),
+  NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
+  NEONMAP1(vld2q_v, arm_neon_vld2, 0),
+  NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
+  NEONMAP1(vld3_v, arm_neon_vld3, 0),
+  NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
+  NEONMAP1(vld3q_v, arm_neon_vld3, 0),
+  NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
+  NEONMAP1(vld4_v, arm_neon_vld4, 0),
+  NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
+  NEONMAP1(vld4q_v, arm_neon_vld4, 0),
+  NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
+  NEONMAP0(vmovl_v),
+  NEONMAP0(vmovn_v),
+  NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType),
+  NEONMAP0(vmull_v),
+  NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType),
+  NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
+  NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
+  NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType),
+  NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
+  NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
+  NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType),
+  NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts),
+  NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType),
+  NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType),
+  NEONMAP2(vqadd_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqaddq_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqdmlal_v, arm_neon_vqdmull, arm_neon_vqadds, 0),
+  NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, arm_neon_vqsubs, 0),
+  NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType),
+  NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType),
+  NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType),
+  NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts),
+  NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
+  NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
+  NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
+  NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
+  NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
+  NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
+  NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
+  NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqsub_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqsubq_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts),
+  NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType),
+  NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
+  NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
+  NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType),
+  NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType),
+  NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
+  NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
+  NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType),
+  NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType),
+  NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType),
+  NEONMAP1(vsha1su0q_v, arm_neon_sha1su0, 0),
+  NEONMAP1(vsha1su1q_v, arm_neon_sha1su1, 0),
+  NEONMAP1(vsha256h2q_v, arm_neon_sha256h2, 0),
+  NEONMAP1(vsha256hq_v, arm_neon_sha256h, 0),
+  NEONMAP1(vsha256su0q_v, arm_neon_sha256su0, 0),
+  NEONMAP1(vsha256su1q_v, arm_neon_sha256su1, 0),
+  NEONMAP0(vshl_n_v),
+  NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP0(vshll_n_v),
+  NEONMAP0(vshlq_n_v),
+  NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP0(vshr_n_v),
+  NEONMAP0(vshrn_n_v),
+  NEONMAP0(vshrq_n_v),
+  NEONMAP1(vst1_v, arm_neon_vst1, 0),
+  NEONMAP1(vst1q_v, arm_neon_vst1, 0),
+  NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
+  NEONMAP1(vst2_v, arm_neon_vst2, 0),
+  NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
+  NEONMAP1(vst2q_v, arm_neon_vst2, 0),
+  NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0),
+  NEONMAP1(vst3_v, arm_neon_vst3, 0),
+  NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0),
+  NEONMAP1(vst3q_v, arm_neon_vst3, 0),
+  NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0),
+  NEONMAP1(vst4_v, arm_neon_vst4, 0),
+  NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0),
+  NEONMAP1(vst4q_v, arm_neon_vst4, 0),
+  NEONMAP0(vsubhn_v),
+  NEONMAP0(vtrn_v),
+  NEONMAP0(vtrnq_v),
+  NEONMAP0(vtst_v),
+  NEONMAP0(vtstq_v),
+  NEONMAP0(vuzp_v),
+  NEONMAP0(vuzpq_v),
+  NEONMAP0(vzip_v),
+  NEONMAP0(vzipq_v),
+};
+
+#undef NEONMAP0
+#undef NEONMAP1
+#undef NEONMAP2
+
+static bool NEONSIMDIntrinsicsProvenSorted = false;
+
+static bool AArch64SISDIntrinsicInfoProvenSorted = false;
+
+static const NeonIntrinsicInfo *
+findNeonIntrinsicInMap(llvm::ArrayRef<NeonIntrinsicInfo> IntrinsicMap,
+                       unsigned BuiltinID, bool &MapProvenSorted) {
+
+#ifndef NDEBUG
+  if (!MapProvenSorted) {
+    // FIXME: use std::is_sorted once C++11 is allowed
+    for (unsigned i = 0; i < IntrinsicMap.size() - 1; ++i)
+      assert(IntrinsicMap[i].BuiltinID <= IntrinsicMap[i + 1].BuiltinID);
+    MapProvenSorted = true;
+  }
+#endif
+
+  const NeonIntrinsicInfo *Builtin =
+      std::lower_bound(IntrinsicMap.begin(), IntrinsicMap.end(), BuiltinID);
+
+  if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID)
+    return Builtin;
+
+  return 0;
+}
+
+Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
+                                                   unsigned Modifier,
+                                                   llvm::Type *ArgType,
+                                                   const CallExpr *E) {
+  // Return type.
+  SmallVector<llvm::Type *, 3> Tys;
+  if (Modifier & AddRetType) {
+    llvm::Type *Ty = ConvertType(E->getCallReturnType());
+    if (Modifier & VectorizeRetType)
+      Ty = llvm::VectorType::get(Ty, 1);
+
+    Tys.push_back(Ty);
+  }
+
+  // Arguments.
+  if (Modifier & VectorizeArgTypes)
+    ArgType = llvm::VectorType::get(ArgType, 1);
+
+  if (Modifier & (Add1ArgType | Add2ArgTypes))
+    Tys.push_back(ArgType);
+
+  if (Modifier & Add2ArgTypes)
+    Tys.push_back(ArgType);
+
+  if (Modifier & InventFloatType)
+    Tys.push_back(FloatTy);
+
+  return CGM.getIntrinsic(IntrinsicID, Tys);
+}
+
+
+static Value *EmitAArch64ScalarBuiltinExpr(CodeGenFunction &CGF,
+                                           const NeonIntrinsicInfo &SISDInfo,
+                                           const CallExpr *E) {
+  unsigned BuiltinID = SISDInfo.BuiltinID;
+  unsigned int Int = SISDInfo.LLVMIntrinsic;
+  unsigned IntTypes = SISDInfo.TypeModifier;
+  const char *s = SISDInfo.NameHint;
+
+  SmallVector<Value *, 4> Ops;
+  for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
+    Ops.push_back(CGF.EmitScalarExpr(E->getArg(i)));
+  }
+
+  // AArch64 scalar builtins are not overloaded; they do not have an extra
+  // argument specifying the vector type, so each case must be handled here.
+  switch (BuiltinID) {
+  default: break;
+  case NEON::BI__builtin_neon_vdups_lane_f32:
+  case NEON::BI__builtin_neon_vdupd_lane_f64:
+  case NEON::BI__builtin_neon_vdups_laneq_f32:
+  case NEON::BI__builtin_neon_vdupd_laneq_f64: {
+    return CGF.Builder.CreateExtractElement(Ops[0], Ops[1], "vdup_lane");
+  }
+  case NEON::BI__builtin_neon_vdupb_lane_i8:
+  case NEON::BI__builtin_neon_vduph_lane_i16:
+  case NEON::BI__builtin_neon_vdups_lane_i32:
+  case NEON::BI__builtin_neon_vdupd_lane_i64:
+  case NEON::BI__builtin_neon_vdupb_laneq_i8:
+  case NEON::BI__builtin_neon_vduph_laneq_i16:
+  case NEON::BI__builtin_neon_vdups_laneq_i32:
+  case NEON::BI__builtin_neon_vdupd_laneq_i64: {
+    // The backend treats NEON scalar types as v1ix types, so dup the
+    // requested lane from the source vector into a v1ix vector with a
+    // shufflevector.
+    s = "vdup_lane";
+    Value* SV = llvm::ConstantVector::getSplat(1, cast<ConstantInt>(Ops[1]));
+    Value *Result = CGF.Builder.CreateShuffleVector(Ops[0], Ops[0], SV, s);
+    llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
+    // Cast the intrinsic's one-element vector result to the scalar type
+    // expected by the builtin.
+    return CGF.Builder.CreateBitCast(Result, Ty, s);
+  }
+  case NEON::BI__builtin_neon_vqdmlalh_lane_s16 :
+  case NEON::BI__builtin_neon_vqdmlalh_laneq_s16 :
+  case NEON::BI__builtin_neon_vqdmlals_lane_s32 :
+  case NEON::BI__builtin_neon_vqdmlals_laneq_s32 :
+  case NEON::BI__builtin_neon_vqdmlslh_lane_s16 :
+  case NEON::BI__builtin_neon_vqdmlslh_laneq_s16 :
+  case NEON::BI__builtin_neon_vqdmlsls_lane_s32 :
+  case NEON::BI__builtin_neon_vqdmlsls_laneq_s32 : {
+    Int = Intrinsic::arm_neon_vqadds;
+    if (BuiltinID == NEON::BI__builtin_neon_vqdmlslh_lane_s16 ||
+        BuiltinID == NEON::BI__builtin_neon_vqdmlslh_laneq_s16 ||
+        BuiltinID == NEON::BI__builtin_neon_vqdmlsls_lane_s32 ||
+        BuiltinID == NEON::BI__builtin_neon_vqdmlsls_laneq_s32) {
+      Int = Intrinsic::arm_neon_vqsubs;
+    }
+    // create vqdmull call with b * c[i]
+    llvm::Type *Ty = CGF.ConvertType(E->getArg(1)->getType());
+    llvm::VectorType *OpVTy = llvm::VectorType::get(Ty, 1);
+    Ty = CGF.ConvertType(E->getArg(0)->getType());
+    llvm::VectorType *ResVTy = llvm::VectorType::get(Ty, 1);
+    Value *F = CGF.CGM.getIntrinsic(Intrinsic::arm_neon_vqdmull, ResVTy);
+    Value *V = UndefValue::get(OpVTy);
+    llvm::Constant *CI = ConstantInt::get(CGF.Int32Ty, 0);
+    SmallVector<Value *, 2> MulOps;
+    MulOps.push_back(Ops[1]);
+    MulOps.push_back(Ops[2]);
+    MulOps[0] = CGF.Builder.CreateInsertElement(V, MulOps[0], CI);
+    MulOps[1] = CGF.Builder.CreateExtractElement(MulOps[1], Ops[3], "extract");
+    MulOps[1] = CGF.Builder.CreateInsertElement(V, MulOps[1], CI);
+    Value *MulRes = CGF.Builder.CreateCall2(F, MulOps[0], MulOps[1]);
+    // create vqadds call with a +/- vqdmull result
+    F = CGF.CGM.getIntrinsic(Int, ResVTy);
+    SmallVector<Value *, 2> AddOps;
+    AddOps.push_back(Ops[0]);
+    AddOps.push_back(MulRes);
+    V = UndefValue::get(ResVTy);
+    AddOps[0] = CGF.Builder.CreateInsertElement(V, AddOps[0], CI);
+    Value *AddRes = CGF.Builder.CreateCall2(F, AddOps[0], AddOps[1]);
+    return CGF.Builder.CreateBitCast(AddRes, Ty);
+  }
+  case NEON::BI__builtin_neon_vfmas_lane_f32:
+  case NEON::BI__builtin_neon_vfmas_laneq_f32:
+  case NEON::BI__builtin_neon_vfmad_lane_f64:
+  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
+    llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
+    Value *F = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
+    Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
+    return CGF.Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
+  }
+  // Scalar Floating-point Multiply Extended
+  case NEON::BI__builtin_neon_vmulxs_f32:
+  case NEON::BI__builtin_neon_vmulxd_f64: {
+    Int = Intrinsic::aarch64_neon_vmulx;
+    llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
+    return CGF.EmitNeonCall(CGF.CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
+  }
+  case NEON::BI__builtin_neon_vmul_n_f64: {
+    // v1f64 vmul_n_f64  should be mapped to Neon scalar mul lane
+    llvm::Type *VTy = GetNeonType(&CGF,
+      NeonTypeFlags(NeonTypeFlags::Float64, false, false));
+    Ops[0] = CGF.Builder.CreateBitCast(Ops[0], VTy);
+    llvm::Value *Idx = llvm::ConstantInt::get(CGF.Int32Ty, 0);
+    Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], Idx, "extract");
+    Value *Result = CGF.Builder.CreateFMul(Ops[0], Ops[1]);
+    return CGF.Builder.CreateBitCast(Result, VTy);
+  }
+  case NEON::BI__builtin_neon_vget_lane_i8:
+  case NEON::BI__builtin_neon_vget_lane_i16:
+  case NEON::BI__builtin_neon_vget_lane_i32:
+  case NEON::BI__builtin_neon_vget_lane_i64:
+  case NEON::BI__builtin_neon_vget_lane_f32:
+  case NEON::BI__builtin_neon_vget_lane_f64:
+  case NEON::BI__builtin_neon_vgetq_lane_i8:
+  case NEON::BI__builtin_neon_vgetq_lane_i16:
+  case NEON::BI__builtin_neon_vgetq_lane_i32:
+  case NEON::BI__builtin_neon_vgetq_lane_i64:
+  case NEON::BI__builtin_neon_vgetq_lane_f32:
+  case NEON::BI__builtin_neon_vgetq_lane_f64:
+    return CGF.EmitARMBuiltinExpr(NEON::BI__builtin_neon_vget_lane_i8, E);
+  case NEON::BI__builtin_neon_vset_lane_i8:
+  case NEON::BI__builtin_neon_vset_lane_i16:
+  case NEON::BI__builtin_neon_vset_lane_i32:
+  case NEON::BI__builtin_neon_vset_lane_i64:
+  case NEON::BI__builtin_neon_vset_lane_f32:
+  case NEON::BI__builtin_neon_vset_lane_f64:
+  case NEON::BI__builtin_neon_vsetq_lane_i8:
+  case NEON::BI__builtin_neon_vsetq_lane_i16:
+  case NEON::BI__builtin_neon_vsetq_lane_i32:
+  case NEON::BI__builtin_neon_vsetq_lane_i64:
+  case NEON::BI__builtin_neon_vsetq_lane_f32:
+  case NEON::BI__builtin_neon_vsetq_lane_f64:
+    return CGF.EmitARMBuiltinExpr(NEON::BI__builtin_neon_vset_lane_i8, E);
+
+  case NEON::BI__builtin_neon_vcled_s64:
+  case NEON::BI__builtin_neon_vcled_u64:
+  case NEON::BI__builtin_neon_vcles_f32:
+  case NEON::BI__builtin_neon_vcled_f64:
+  case NEON::BI__builtin_neon_vcltd_s64:
+  case NEON::BI__builtin_neon_vcltd_u64:
+  case NEON::BI__builtin_neon_vclts_f32:
+  case NEON::BI__builtin_neon_vcltd_f64:
+  case NEON::BI__builtin_neon_vcales_f32:
+  case NEON::BI__builtin_neon_vcaled_f64:
+  case NEON::BI__builtin_neon_vcalts_f32:
+  case NEON::BI__builtin_neon_vcaltd_f64:
+    // Only one direction of comparison actually exists: cmle is really a cmge
+    // with swapped operands. The table gives us the right intrinsic, but we
+    // still need to do the swap.
+    std::swap(Ops[0], Ops[1]);
+    break;
+  case NEON::BI__builtin_neon_vceqzd_s64:
+  case NEON::BI__builtin_neon_vceqzd_u64:
+  case NEON::BI__builtin_neon_vcgezd_s64:
+  case NEON::BI__builtin_neon_vcgtzd_s64:
+  case NEON::BI__builtin_neon_vclezd_s64:
+  case NEON::BI__builtin_neon_vcltzd_s64:
+    // Add implicit zero operand.
+    Ops.push_back(llvm::Constant::getNullValue(Ops[0]->getType()));
+    break;
+  case NEON::BI__builtin_neon_vceqzs_f32:
+  case NEON::BI__builtin_neon_vceqzd_f64:
+  case NEON::BI__builtin_neon_vcgezs_f32:
+  case NEON::BI__builtin_neon_vcgezd_f64:
+  case NEON::BI__builtin_neon_vcgtzs_f32:
+  case NEON::BI__builtin_neon_vcgtzd_f64:
+  case NEON::BI__builtin_neon_vclezs_f32:
+  case NEON::BI__builtin_neon_vclezd_f64:
+  case NEON::BI__builtin_neon_vcltzs_f32:
+  case NEON::BI__builtin_neon_vcltzd_f64:
+    // Add implicit zero operand.
+    Ops.push_back(llvm::Constant::getNullValue(CGF.FloatTy));
+    break;
+  }
+
+
+  assert(Int && "Generic code assumes a valid intrinsic");
+
+  // Determine the type(s) of this overloaded AArch64 intrinsic.
+  const Expr *Arg = E->getArg(0);
+  llvm::Type *ArgTy = CGF.ConvertType(Arg->getType());
+  Function *F = CGF.LookupNeonLLVMIntrinsic(Int, IntTypes, ArgTy, E);
+
+  Value *Result = CGF.EmitNeonCall(F, Ops, s);
+  llvm::Type *ResultType = CGF.ConvertType(E->getType());
+  // Cast the intrinsic's one-element vector result to the scalar type
+  // expected by the builtin.
+  return CGF.Builder.CreateBitCast(Result, ResultType, s);
+}
+
+Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
+    unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic,
+    const char *NameHint, unsigned Modifier, const CallExpr *E,
+    SmallVectorImpl<llvm::Value *> &Ops, llvm::Value *Align) {
   // Get the last argument, which specifies the vector type.
-  llvm::APSInt Result;
+  llvm::APSInt NeonTypeConst;
   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
-  if (!Arg->isIntegerConstantExpr(Result, getContext()))
+  if (!Arg->isIntegerConstantExpr(NeonTypeConst, getContext()))
     return 0;
 
   // Determine the type of this overloaded NEON intrinsic.
-  NeonTypeFlags Type(Result.getZExtValue());
+  NeonTypeFlags Type(NeonTypeConst.getZExtValue());
   bool Usgn = Type.isUnsigned();
   bool Quad = Type.isQuad();
 
@@ -1774,31 +2614,17 @@ Value *CodeGenFunction::EmitCommonNeonBu
   if (!Ty)
     return 0;
 
-  unsigned Int;
+  unsigned Int = LLVMIntrinsic;
+  if ((Modifier & UnsignedAlts) && !Usgn)
+    Int = AltLLVMIntrinsic;
+
   switch (BuiltinID) {
   default: break;
   case NEON::BI__builtin_neon_vabs_v:
   case NEON::BI__builtin_neon_vabsq_v:
     if (VTy->getElementType()->isFloatingPointTy())
       return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vabs, Ty), Ops,
-                        "vabs");
-  case NEON::BI__builtin_neon_vaeseq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_aese),
-                        Ops, "aese");
-  case NEON::BI__builtin_neon_vaesdq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_aesd),
-                        Ops, "aesd");
-  case NEON::BI__builtin_neon_vaesmcq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_aesmc),
-                        Ops, "aesmc");
-  case NEON::BI__builtin_neon_vaesimcq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_aesimc),
-                        Ops, "aesimc");
-  case NEON::BI__builtin_neon_vabd_v:
-  case NEON::BI__builtin_neon_vabdq_v:
-    Int = Usgn ? Intrinsic::arm_neon_vabdu : Intrinsic::arm_neon_vabds;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd");
+    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs");
   case NEON::BI__builtin_neon_vaddhn_v: {
     llvm::VectorType *SrcTy =
         llvm::VectorType::getExtendedElementVectorType(VTy);
@@ -1817,65 +2643,28 @@ Value *CodeGenFunction::EmitCommonNeonBu
     // %res = trunc <4 x i32> %high to <4 x i16>
     return Builder.CreateTrunc(Ops[0], VTy, "vaddhn");
   }
-  case NEON::BI__builtin_neon_vbsl_v:
-  case NEON::BI__builtin_neon_vbslq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vbsl, Ty),
-                        Ops, "vbsl");
   case NEON::BI__builtin_neon_vcale_v:
   case NEON::BI__builtin_neon_vcaleq_v:
-    std::swap(Ops[0], Ops[1]);
-  case NEON::BI__builtin_neon_vcage_v:
-  case NEON::BI__builtin_neon_vcageq_v: {
-    llvm::Type *VecFlt = llvm::VectorType::get(
-        VTy->getScalarSizeInBits() == 32 ? FloatTy : DoubleTy,
-        VTy->getNumElements());
-    llvm::Type *Tys[] = { VTy, VecFlt };
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vacge, Tys);
-    return EmitNeonCall(F, Ops, "vcage");
-  }
   case NEON::BI__builtin_neon_vcalt_v:
   case NEON::BI__builtin_neon_vcaltq_v:
     std::swap(Ops[0], Ops[1]);
+  case NEON::BI__builtin_neon_vcage_v:
+  case NEON::BI__builtin_neon_vcageq_v:
   case NEON::BI__builtin_neon_vcagt_v:
   case NEON::BI__builtin_neon_vcagtq_v: {
     llvm::Type *VecFlt = llvm::VectorType::get(
         VTy->getScalarSizeInBits() == 32 ? FloatTy : DoubleTy,
         VTy->getNumElements());
     llvm::Type *Tys[] = { VTy, VecFlt };
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vacgt, Tys);
-    return EmitNeonCall(F, Ops, "vcagt");
-  }
-  case NEON::BI__builtin_neon_vcls_v:
-  case NEON::BI__builtin_neon_vclsq_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vcls, Ty);
-    return EmitNeonCall(F, Ops, "vcls");
+    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
+    return EmitNeonCall(F, Ops, NameHint);
   }
   case NEON::BI__builtin_neon_vclz_v:
-  case NEON::BI__builtin_neon_vclzq_v: {
-    // Generate target-independent intrinsic; also need to add second argument
+  case NEON::BI__builtin_neon_vclzq_v:
+    // We generate a target-independent intrinsic; it needs a second argument
     // for whether or not clz of zero is undefined; on ARM it isn't.
-    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ty);
     Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef()));
-    return EmitNeonCall(F, Ops, "vclz");
-  }
-  case NEON::BI__builtin_neon_vcnt_v:
-  case NEON::BI__builtin_neon_vcntq_v: {
-    // generate target-independent intrinsic
-    Function *F = CGM.getIntrinsic(Intrinsic::ctpop, Ty);
-    return EmitNeonCall(F, Ops, "vctpop");
-  }
-  case NEON::BI__builtin_neon_vcvt_f16_v: {
-    assert(Type.getEltType() == NeonTypeFlags::Float16 && !Quad &&
-           "unexpected vcvt_f16_v builtin");
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vcvtfp2hf);
-    return EmitNeonCall(F, Ops, "vcvt");
-  }
-  case NEON::BI__builtin_neon_vcvt_f32_f16: {
-    assert(Type.getEltType() == NeonTypeFlags::Float16 && !Quad &&
-           "unexpected vcvt_f32_f16 builtin");
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vcvthf2fp);
-    return EmitNeonCall(F, Ops, "vcvt");
-  }
+    break;
   case NEON::BI__builtin_neon_vcvt_f32_v:
   case NEON::BI__builtin_neon_vcvtq_f32_v:
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
@@ -1884,11 +2673,14 @@ Value *CodeGenFunction::EmitCommonNeonBu
                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
   case NEON::BI__builtin_neon_vcvt_n_f32_v:
   case NEON::BI__builtin_neon_vcvtq_n_f32_v: {
+    bool Double =
+      (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
     llvm::Type *FloatTy =
-      GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad));
+        GetNeonType(this, NeonTypeFlags(Double ? NeonTypeFlags::Float64
+                                               : NeonTypeFlags::Float32,
+                                        false, Quad));
     llvm::Type *Tys[2] = { FloatTy, Ty };
-    Int = Usgn ? Intrinsic::arm_neon_vcvtfxu2fp
-               : Intrinsic::arm_neon_vcvtfxs2fp;
+    Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
     Function *F = CGM.getIntrinsic(Int, Tys);
     return EmitNeonCall(F, Ops, "vcvt_n");
   }
@@ -1907,9 +2699,7 @@ Value *CodeGenFunction::EmitCommonNeonBu
                                                : NeonTypeFlags::Float32,
                                         false, Quad));
     llvm::Type *Tys[2] = { Ty, FloatTy };
-    Int = Usgn ? Intrinsic::arm_neon_vcvtfp2fxu
-               : Intrinsic::arm_neon_vcvtfp2fxs;
-    Function *F = CGM.getIntrinsic(Int, Tys);
+    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
     return EmitNeonCall(F, Ops, "vcvt_n");
   }
   case NEON::BI__builtin_neon_vcvt_s32_v:
@@ -1937,17 +2727,7 @@ Value *CodeGenFunction::EmitCommonNeonBu
   case NEON::BI__builtin_neon_vcvtaq_s32_v:
   case NEON::BI__builtin_neon_vcvtaq_s64_v:
   case NEON::BI__builtin_neon_vcvtaq_u32_v:
-  case NEON::BI__builtin_neon_vcvtaq_u64_v: {
-    Int = Usgn ? Intrinsic::arm_neon_vcvtau : Intrinsic::arm_neon_vcvtas;
-    bool Double =
-      (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
-    llvm::Type *InTy =
-      GetNeonType(this,
-                  NeonTypeFlags(Double ? NeonTypeFlags::Float64
-                                : NeonTypeFlags::Float32, false, Quad));
-    llvm::Type *Tys[2] = { Ty, InTy };
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta");
-  }
+  case NEON::BI__builtin_neon_vcvtaq_u64_v:
   case NEON::BI__builtin_neon_vcvtn_s32_v:
   case NEON::BI__builtin_neon_vcvtn_s64_v:
   case NEON::BI__builtin_neon_vcvtn_u32_v:
@@ -1955,17 +2735,7 @@ Value *CodeGenFunction::EmitCommonNeonBu
   case NEON::BI__builtin_neon_vcvtnq_s32_v:
   case NEON::BI__builtin_neon_vcvtnq_s64_v:
   case NEON::BI__builtin_neon_vcvtnq_u32_v:
-  case NEON::BI__builtin_neon_vcvtnq_u64_v: {
-    Int = Usgn ? Intrinsic::arm_neon_vcvtnu : Intrinsic::arm_neon_vcvtns;
-    bool Double =
-      (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
-    llvm::Type *InTy =
-      GetNeonType(this,
-                  NeonTypeFlags(Double ? NeonTypeFlags::Float64
-                                : NeonTypeFlags::Float32, false, Quad));
-    llvm::Type *Tys[2] = { Ty, InTy };
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn");
-  }
+  case NEON::BI__builtin_neon_vcvtnq_u64_v:
   case NEON::BI__builtin_neon_vcvtp_s32_v:
   case NEON::BI__builtin_neon_vcvtp_s64_v:
   case NEON::BI__builtin_neon_vcvtp_u32_v:
@@ -1973,17 +2743,7 @@ Value *CodeGenFunction::EmitCommonNeonBu
   case NEON::BI__builtin_neon_vcvtpq_s32_v:
   case NEON::BI__builtin_neon_vcvtpq_s64_v:
   case NEON::BI__builtin_neon_vcvtpq_u32_v:
-  case NEON::BI__builtin_neon_vcvtpq_u64_v: {
-    Int = Usgn ? Intrinsic::arm_neon_vcvtpu : Intrinsic::arm_neon_vcvtps;
-    bool Double =
-      (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
-    llvm::Type *InTy =
-      GetNeonType(this,
-                  NeonTypeFlags(Double ? NeonTypeFlags::Float64
-                                : NeonTypeFlags::Float32, false, Quad));
-    llvm::Type *Tys[2] = { Ty, InTy };
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtp");
-  }
+  case NEON::BI__builtin_neon_vcvtpq_u64_v:
   case NEON::BI__builtin_neon_vcvtm_s32_v:
   case NEON::BI__builtin_neon_vcvtm_s64_v:
   case NEON::BI__builtin_neon_vcvtm_u32_v:
@@ -1992,7 +2752,6 @@ Value *CodeGenFunction::EmitCommonNeonBu
   case NEON::BI__builtin_neon_vcvtmq_s64_v:
   case NEON::BI__builtin_neon_vcvtmq_u32_v:
   case NEON::BI__builtin_neon_vcvtmq_u64_v: {
-    Int = Usgn ? Intrinsic::arm_neon_vcvtmu : Intrinsic::arm_neon_vcvtms;
     bool Double =
       (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
     llvm::Type *InTy =
@@ -2000,7 +2759,7 @@ Value *CodeGenFunction::EmitCommonNeonBu
                   NeonTypeFlags(Double ? NeonTypeFlags::Float64
                                 : NeonTypeFlags::Float32, false, Quad));
     llvm::Type *Tys[2] = { Ty, InTy };
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm");
+    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
   }
   case NEON::BI__builtin_neon_vext_v:
   case NEON::BI__builtin_neon_vextq_v: {
@@ -2024,39 +2783,18 @@ Value *CodeGenFunction::EmitCommonNeonBu
     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
     return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
   }
-  case NEON::BI__builtin_neon_vhadd_v:
-  case NEON::BI__builtin_neon_vhaddq_v:
-    Int = Usgn ? Intrinsic::arm_neon_vhaddu : Intrinsic::arm_neon_vhadds;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vhadd");
-  case NEON::BI__builtin_neon_vhsub_v:
-  case NEON::BI__builtin_neon_vhsubq_v:
-    Int = Usgn ? Intrinsic::arm_neon_vhsubu : Intrinsic::arm_neon_vhsubs;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vhsub");
   case NEON::BI__builtin_neon_vld1_v:
   case NEON::BI__builtin_neon_vld1q_v:
     Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Ty),
-                        Ops, "vld1");
+    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vld1");
   case NEON::BI__builtin_neon_vld2_v:
-  case NEON::BI__builtin_neon_vld2q_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld2, Ty);
-    Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld2");
-    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    return Builder.CreateStore(Ops[1], Ops[0]);
-  }
+  case NEON::BI__builtin_neon_vld2q_v:
   case NEON::BI__builtin_neon_vld3_v:
-  case NEON::BI__builtin_neon_vld3q_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld3, Ty);
-    Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld3");
-    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    return Builder.CreateStore(Ops[1], Ops[0]);
-  }
+  case NEON::BI__builtin_neon_vld3q_v:
   case NEON::BI__builtin_neon_vld4_v:
   case NEON::BI__builtin_neon_vld4q_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld4, Ty);
-    Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld4");
+    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Ty);
+    Ops[1] = Builder.CreateCall2(F, Ops[1], Align, NameHint);
     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     return Builder.CreateStore(Ops[1], Ops[0]);
@@ -2073,49 +2811,20 @@ Value *CodeGenFunction::EmitCommonNeonBu
     return EmitNeonSplat(Ops[0], CI);
   }
   case NEON::BI__builtin_neon_vld2_lane_v:
-  case NEON::BI__builtin_neon_vld2q_lane_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld2lane, Ty);
-    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
-    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
-    Ops.push_back(Align);
-    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld2_lane");
-    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    return Builder.CreateStore(Ops[1], Ops[0]);
-  }
+  case NEON::BI__builtin_neon_vld2q_lane_v:
   case NEON::BI__builtin_neon_vld3_lane_v:
-  case NEON::BI__builtin_neon_vld3q_lane_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld3lane, Ty);
-    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
-    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
-    Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
-    Ops.push_back(Align);
-    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld3_lane");
-    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    return Builder.CreateStore(Ops[1], Ops[0]);
-  }
+  case NEON::BI__builtin_neon_vld3q_lane_v:
   case NEON::BI__builtin_neon_vld4_lane_v:
   case NEON::BI__builtin_neon_vld4q_lane_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld4lane, Ty);
-    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
-    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
-    Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
-    Ops[5] = Builder.CreateBitCast(Ops[5], Ty);
+    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Ty);
+    for (unsigned I = 2; I < Ops.size() - 1; ++I)
+      Ops[I] = Builder.CreateBitCast(Ops[I], Ty);
     Ops.push_back(Align);
-    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld3_lane");
+    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), NameHint);
     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     return Builder.CreateStore(Ops[1], Ops[0]);
   }
-  case NEON::BI__builtin_neon_vmax_v:
-  case NEON::BI__builtin_neon_vmaxq_v:
-    Int = Usgn ? Intrinsic::arm_neon_vmaxu : Intrinsic::arm_neon_vmaxs;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
-  case NEON::BI__builtin_neon_vmin_v:
-  case NEON::BI__builtin_neon_vminq_v:
-    Int = Usgn ? Intrinsic::arm_neon_vminu : Intrinsic::arm_neon_vmins;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
   case NEON::BI__builtin_neon_vmovl_v: {
     llvm::Type *DTy =llvm::VectorType::getTruncatedElementVectorType(VTy);
     Ops[0] = Builder.CreateBitCast(Ops[0], DTy);
@@ -2128,11 +2837,6 @@ Value *CodeGenFunction::EmitCommonNeonBu
     Ops[0] = Builder.CreateBitCast(Ops[0], QTy);
     return Builder.CreateTrunc(Ops[0], Ty, "vmovn");
   }
-  case NEON::BI__builtin_neon_vmul_v:
-  case NEON::BI__builtin_neon_vmulq_v:
-    assert(Type.isPoly() && "vmul builtin only supported for polynomial types");
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vmulp, Ty),
-                        Ops, "vmul");
   case NEON::BI__builtin_neon_vmull_v:
     // FIXME: the integer vmull operations could be emitted in terms of pure
     // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of
@@ -2144,7 +2848,6 @@ Value *CodeGenFunction::EmitCommonNeonBu
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
   case NEON::BI__builtin_neon_vpadal_v:
   case NEON::BI__builtin_neon_vpadalq_v: {
-    Int = Usgn ? Intrinsic::arm_neon_vpadalu : Intrinsic::arm_neon_vpadals;
     // The source operand type has twice as many elements of half the size.
     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
     llvm::Type *EltTy =
@@ -2152,14 +2855,10 @@ Value *CodeGenFunction::EmitCommonNeonBu
     llvm::Type *NarrowTy =
       llvm::VectorType::get(EltTy, VTy->getNumElements() * 2);
     llvm::Type *Tys[2] = { Ty, NarrowTy };
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpadal");
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
   }
-  case NEON::BI__builtin_neon_vpadd_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vpadd, Ty),
-                        Ops, "vpadd");
   case NEON::BI__builtin_neon_vpaddl_v:
   case NEON::BI__builtin_neon_vpaddlq_v: {
-    Int = Usgn ? Intrinsic::arm_neon_vpaddlu : Intrinsic::arm_neon_vpaddls;
     // The source operand type has twice as many elements of half the size.
     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
     llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
@@ -2168,128 +2867,29 @@ Value *CodeGenFunction::EmitCommonNeonBu
     llvm::Type *Tys[2] = { Ty, NarrowTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl");
   }
-  case NEON::BI__builtin_neon_vpmax_v:
-    Int = Usgn ? Intrinsic::arm_neon_vpmaxu : Intrinsic::arm_neon_vpmaxs;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
-  case NEON::BI__builtin_neon_vpmin_v:
-    Int = Usgn ? Intrinsic::arm_neon_vpminu : Intrinsic::arm_neon_vpmins;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
-  case NEON::BI__builtin_neon_vqabs_v:
-  case NEON::BI__builtin_neon_vqabsq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqabs, Ty),
-                        Ops, "vqabs");
-  case NEON::BI__builtin_neon_vqadd_v:
-  case NEON::BI__builtin_neon_vqaddq_v:
-    Int = Usgn ? Intrinsic::arm_neon_vqaddu : Intrinsic::arm_neon_vqadds;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqadd");
-  case NEON::BI__builtin_neon_vqmovn_v:
-    Int = Usgn ? Intrinsic::arm_neon_vqmovnu : Intrinsic::arm_neon_vqmovns;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqmovn");
-  case NEON::BI__builtin_neon_vqmovun_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqmovnsu, Ty),
-                        Ops, "vqdmull");
-  case NEON::BI__builtin_neon_vqneg_v:
-  case NEON::BI__builtin_neon_vqnegq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqneg, Ty),
-                        Ops, "vqneg");
-  case NEON::BI__builtin_neon_vqsub_v:
-  case NEON::BI__builtin_neon_vqsubq_v:
-    Int = Usgn ? Intrinsic::arm_neon_vqsubu : Intrinsic::arm_neon_vqsubs;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqsub");
-  case NEON::BI__builtin_neon_vqdmlal_v: {
+  case NEON::BI__builtin_neon_vqdmlal_v:
+  case NEON::BI__builtin_neon_vqdmlsl_v: {
     SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
-    Value *Mul = EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqdmull, Ty),
+    Value *Mul = EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty),
                               MulOps, "vqdmlal");
 
-    SmallVector<Value *, 2> AddOps;
-    AddOps.push_back(Ops[0]);
-    AddOps.push_back(Mul);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqadds, Ty),
-                        AddOps, "vqdmlal");
+    SmallVector<Value *, 2> AccumOps;
+    AccumOps.push_back(Ops[0]);
+    AccumOps.push_back(Mul);
+    return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty),
+                        AccumOps, NameHint);
   }
-  case NEON::BI__builtin_neon_vqdmlsl_v: {
-    SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
-    Value *Mul = EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqdmull, Ty),
-                              MulOps, "vqdmlsl");
-
-    SmallVector<Value *, 2> SubOps;
-    SubOps.push_back(Ops[0]);
-    SubOps.push_back(Mul);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqsubs, Ty),
-                        SubOps, "vqdmlsl");
-  }
-  case NEON::BI__builtin_neon_vqdmulh_v:
-  case NEON::BI__builtin_neon_vqdmulhq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqdmulh, Ty),
-                        Ops, "vqdmulh");
-  case NEON::BI__builtin_neon_vqdmull_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqdmull, Ty),
-                        Ops, "vqdmull");
   case NEON::BI__builtin_neon_vqshl_n_v:
   case NEON::BI__builtin_neon_vqshlq_n_v:
-    Int = Usgn ? Intrinsic::arm_neon_vqshiftu : Intrinsic::arm_neon_vqshifts;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
                         1, false);
-  case NEON::BI__builtin_neon_vqrdmulh_v:
-  case NEON::BI__builtin_neon_vqrdmulhq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrdmulh, Ty),
-                        Ops, "vqrdmulh");
-  case NEON::BI__builtin_neon_vqrshl_v:
-  case NEON::BI__builtin_neon_vqrshlq_v:
-    Int = Usgn ? Intrinsic::arm_neon_vqrshiftu : Intrinsic::arm_neon_vqrshifts;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshl");
-  case NEON::BI__builtin_neon_vqshl_v:
-  case NEON::BI__builtin_neon_vqshlq_v:
-    Int = Usgn ? Intrinsic::arm_neon_vqshiftu : Intrinsic::arm_neon_vqshifts;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl");
-  case NEON::BI__builtin_neon_vraddhn_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vraddhn, Ty),
-                        Ops, "vraddhn");
   case NEON::BI__builtin_neon_vrecpe_v:
   case NEON::BI__builtin_neon_vrecpeq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecpe, Ty),
-                        Ops, "vrecpe");
-  case NEON::BI__builtin_neon_vrecps_v:
-  case NEON::BI__builtin_neon_vrecpsq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecps, Ty),
-                        Ops, "vrecps");
-  case NEON::BI__builtin_neon_vrhadd_v:
-  case NEON::BI__builtin_neon_vrhaddq_v:
-    Int = Usgn ? Intrinsic::arm_neon_vrhaddu : Intrinsic::arm_neon_vrhadds;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrhadd");
-  case NEON::BI__builtin_neon_vrshl_v:
-  case NEON::BI__builtin_neon_vrshlq_v:
-    Int = Usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshl");
   case NEON::BI__builtin_neon_vrsqrte_v:
   case NEON::BI__builtin_neon_vrsqrteq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrsqrte, Ty),
-                        Ops, "vrsqrte");
-  case NEON::BI__builtin_neon_vrsqrts_v:
-  case NEON::BI__builtin_neon_vrsqrtsq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrsqrts, Ty),
-                        Ops, "vrsqrts");
-  case NEON::BI__builtin_neon_vrsubhn_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrsubhn, Ty),
-                        Ops, "vrsubhn");
-  case NEON::BI__builtin_neon_vsha1su1q_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1su1),
-                        Ops, "sha1su1");
-  case NEON::BI__builtin_neon_vsha256su0q_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha256su0),
-                        Ops, "sha256su0");
-  case NEON::BI__builtin_neon_vsha1su0q_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1su0),
-                        Ops, "sha1su0");
-  case NEON::BI__builtin_neon_vsha256hq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha256h),
-                        Ops, "sha256h");
-  case NEON::BI__builtin_neon_vsha256h2q_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha256h2),
-                        Ops, "sha256h2");
-  case NEON::BI__builtin_neon_vsha256su1q_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha256su1),
-                        Ops, "sha256su1");
+    Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
+
   case NEON::BI__builtin_neon_vshl_n_v:
   case NEON::BI__builtin_neon_vshlq_n_v:
     Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false);
@@ -2305,10 +2905,6 @@ Value *CodeGenFunction::EmitCommonNeonBu
     Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false);
     return Builder.CreateShl(Ops[0], Ops[1], "vshll_n");
   }
-  case NEON::BI__builtin_neon_vshl_v:
-  case NEON::BI__builtin_neon_vshlq_v:
-    Int = Usgn ? Intrinsic::arm_neon_vshiftu : Intrinsic::arm_neon_vshifts;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vshl");
   case NEON::BI__builtin_neon_vshrn_n_v: {
     llvm::Type *SrcTy = llvm::VectorType::getExtendedElementVectorType(VTy);
     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
@@ -2324,39 +2920,20 @@ Value *CodeGenFunction::EmitCommonNeonBu
     return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, Usgn, "vshr_n");
   case NEON::BI__builtin_neon_vst1_v:
   case NEON::BI__builtin_neon_vst1q_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst1, Ty),
-                        Ops, "");
   case NEON::BI__builtin_neon_vst2_v:
   case NEON::BI__builtin_neon_vst2q_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst2, Ty),
-                        Ops, "");
   case NEON::BI__builtin_neon_vst3_v:
   case NEON::BI__builtin_neon_vst3q_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst3, Ty),
-                        Ops, "");
   case NEON::BI__builtin_neon_vst4_v:
   case NEON::BI__builtin_neon_vst4q_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst4, Ty),
-                        Ops, "");
   case NEON::BI__builtin_neon_vst2_lane_v:
   case NEON::BI__builtin_neon_vst2q_lane_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst2lane, Ty),
-                        Ops, "");
   case NEON::BI__builtin_neon_vst3_lane_v:
   case NEON::BI__builtin_neon_vst3q_lane_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst3lane, Ty),
-                        Ops, "");
   case NEON::BI__builtin_neon_vst4_lane_v:
   case NEON::BI__builtin_neon_vst4q_lane_v:
     Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst4lane, Ty),
-                        Ops, "");
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "");
   case NEON::BI__builtin_neon_vsubhn_v: {
     llvm::VectorType *SrcTy =
         llvm::VectorType::getExtendedElementVectorType(VTy);
@@ -2445,620 +3022,16 @@ Value *CodeGenFunction::EmitCommonNeonBu
   }
   }
 
-  return 0;
-}
-
-enum {
-  AddRetType = (1 << 0),
-  Add1ArgType = (1 << 1),
-  Add2ArgTypes = (1 << 2),
-
-  VectorizeRetType = (1 << 3),
-  VectorizeArgTypes = (1 << 4),
-
-  InventFloatType = (1 << 5),
-
-  Vectorize1ArgType = Add1ArgType | VectorizeArgTypes,
-  VectorRet = AddRetType | VectorizeRetType,
-  VectorRetGetArgs01 =
-      AddRetType | Add2ArgTypes | VectorizeRetType | VectorizeArgTypes,
-  FpCmpzModifiers =
-      AddRetType | VectorizeRetType | Add1ArgType | InventFloatType,
-};
-
- struct NeonSISDIntrinsicInfo {
-  unsigned BuiltinID;
-  unsigned LLVMIntrinsic;
-  const char *NameHint;
-  unsigned TypeModifier;
-
-  bool operator<(unsigned RHSBuiltinID) const {
-    return BuiltinID < RHSBuiltinID;
-  }
-};
-
-#define SISDMAP0(NameBase, Type) \
-  { NEON::BI__builtin_neon_ ## NameBase ## _ ## Type, 0, #NameBase, 0 }
-
-#define SISDMAP1(NameBase, Type, LLVMIntrinsic, TypeModifier) \
-  { NEON:: BI__builtin_neon_ ## NameBase ## _ ## Type, \
-    Intrinsic::LLVMIntrinsic, #NameBase, TypeModifier }
-
-static const NeonSISDIntrinsicInfo AArch64SISDIntrinsicInfo[] = {
-  SISDMAP1(vabdd, f64, aarch64_neon_vabd, AddRetType),
-  SISDMAP1(vabds, f32, aarch64_neon_vabd, AddRetType),
-  SISDMAP1(vabsd, s64, aarch64_neon_vabs, 0),
-  SISDMAP1(vaddd, s64, aarch64_neon_vaddds, 0),
-  SISDMAP1(vaddd, u64, aarch64_neon_vadddu, 0),
-  SISDMAP1(vaddlv, s16, aarch64_neon_saddlv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddlv, s32, aarch64_neon_saddlv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddlv, s8, aarch64_neon_saddlv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddlv, u16, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddlv, u32, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddlv, u8, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddlvq, s16, aarch64_neon_saddlv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddlvq, s32, aarch64_neon_saddlv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddlvq, s8, aarch64_neon_saddlv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddlvq, u16, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddlvq, u32, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddlvq, u8, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddv, f32, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
-  SISDMAP1(vaddv, s16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddv, s32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddv, s8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddv, u16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddv, u32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddv, u8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddvq, f32, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
-  SISDMAP1(vaddvq, f64, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
-  SISDMAP1(vaddvq, s16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddvq, s32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddvq, s64, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddvq, s8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddvq, u16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddvq, u32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddvq, u64, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vaddvq, u8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
-  SISDMAP1(vcaged, f64, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcages, f32, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcagtd, f64, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcagts, f32, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcaled, f64, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcales, f32, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcaltd, f64, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcalts, f32, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
-  SISDMAP1(vceqd, f64, aarch64_neon_fceq, VectorRet | Add2ArgTypes),
-  SISDMAP1(vceqd, s64, aarch64_neon_vceq, VectorRetGetArgs01),
-  SISDMAP1(vceqd, u64, aarch64_neon_vceq, VectorRetGetArgs01),
-  SISDMAP1(vceqs, f32, aarch64_neon_fceq, VectorRet | Add2ArgTypes),
-  SISDMAP1(vceqzd, f64, aarch64_neon_fceq, FpCmpzModifiers),
-  SISDMAP1(vceqzd, s64, aarch64_neon_vceq, VectorRetGetArgs01),
-  SISDMAP1(vceqzd, u64, aarch64_neon_vceq, VectorRetGetArgs01),
-  SISDMAP1(vceqzs, f32, aarch64_neon_fceq, FpCmpzModifiers),
-  SISDMAP1(vcged, f64, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcged, s64, aarch64_neon_vcge, VectorRetGetArgs01),
-  SISDMAP1(vcged, u64, aarch64_neon_vchs, VectorRetGetArgs01),
-  SISDMAP1(vcges, f32, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcgezd, f64, aarch64_neon_fcge, FpCmpzModifiers),
-  SISDMAP1(vcgezd, s64, aarch64_neon_vcge, VectorRetGetArgs01),
-  SISDMAP1(vcgezs, f32, aarch64_neon_fcge, FpCmpzModifiers),
-  SISDMAP1(vcgtd, f64, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcgtd, s64, aarch64_neon_vcgt, VectorRetGetArgs01),
-  SISDMAP1(vcgtd, u64, aarch64_neon_vchi, VectorRetGetArgs01),
-  SISDMAP1(vcgts, f32, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcgtzd, f64, aarch64_neon_fcgt, FpCmpzModifiers),
-  SISDMAP1(vcgtzd, s64, aarch64_neon_vcgt, VectorRetGetArgs01),
-  SISDMAP1(vcgtzs, f32, aarch64_neon_fcgt, FpCmpzModifiers),
-  SISDMAP1(vcled, f64, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcled, s64, aarch64_neon_vcge, VectorRetGetArgs01),
-  SISDMAP1(vcled, u64, aarch64_neon_vchs, VectorRetGetArgs01),
-  SISDMAP1(vcles, f32, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
-  SISDMAP1(vclezd, f64, aarch64_neon_fclez, FpCmpzModifiers),
-  SISDMAP1(vclezd, s64, aarch64_neon_vclez, VectorRetGetArgs01),
-  SISDMAP1(vclezs, f32, aarch64_neon_fclez, FpCmpzModifiers),
-  SISDMAP1(vcltd, f64, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcltd, s64, aarch64_neon_vcgt, VectorRetGetArgs01),
-  SISDMAP1(vcltd, u64, aarch64_neon_vchi, VectorRetGetArgs01),
-  SISDMAP1(vclts, f32, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
-  SISDMAP1(vcltzd, f64, aarch64_neon_fcltz, FpCmpzModifiers),
-  SISDMAP1(vcltzd, s64, aarch64_neon_vcltz, VectorRetGetArgs01),
-  SISDMAP1(vcltzs, f32, aarch64_neon_fcltz, FpCmpzModifiers),
-  SISDMAP1(vcvtad_s64, f64, aarch64_neon_fcvtas, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtad_u64, f64, aarch64_neon_fcvtau, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtas_s32, f32, aarch64_neon_fcvtas, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtas_u32, f32, aarch64_neon_fcvtau, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtd_f64, s64, aarch64_neon_vcvtint2fps, AddRetType | Vectorize1ArgType),
-  SISDMAP1(vcvtd_f64, u64, aarch64_neon_vcvtint2fpu, AddRetType | Vectorize1ArgType),
-  SISDMAP1(vcvtd_n_f64, s64, aarch64_neon_vcvtfxs2fp_n, AddRetType | Vectorize1ArgType),
-  SISDMAP1(vcvtd_n_f64, u64, aarch64_neon_vcvtfxu2fp_n, AddRetType | Vectorize1ArgType),
-  SISDMAP1(vcvtd_n_s64, f64, aarch64_neon_vcvtfp2fxs_n, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtd_n_u64, f64, aarch64_neon_vcvtfp2fxu_n, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtd_s64, f64, aarch64_neon_fcvtzs, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtd_u64, f64, aarch64_neon_fcvtzu, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtmd_s64, f64, aarch64_neon_fcvtms, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtmd_u64, f64, aarch64_neon_fcvtmu, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtms_s32, f32, aarch64_neon_fcvtms, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtms_u32, f32, aarch64_neon_fcvtmu, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtnd_s64, f64, aarch64_neon_fcvtns, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtnd_u64, f64, aarch64_neon_fcvtnu, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtns_s32, f32, aarch64_neon_fcvtns, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtns_u32, f32, aarch64_neon_fcvtnu, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtpd_s64, f64, aarch64_neon_fcvtps, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtpd_u64, f64, aarch64_neon_fcvtpu, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtps_s32, f32, aarch64_neon_fcvtps, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtps_u32, f32, aarch64_neon_fcvtpu, VectorRet | Add1ArgType),
-  SISDMAP1(vcvts_f32, s32, aarch64_neon_vcvtint2fps, AddRetType | Vectorize1ArgType),
-  SISDMAP1(vcvts_f32, u32, aarch64_neon_vcvtint2fpu, AddRetType | Vectorize1ArgType),
-  SISDMAP1(vcvts_n_f32, s32, aarch64_neon_vcvtfxs2fp_n, AddRetType | Vectorize1ArgType),
-  SISDMAP1(vcvts_n_f32, u32, aarch64_neon_vcvtfxu2fp_n, AddRetType | Vectorize1ArgType),
-  SISDMAP1(vcvts_n_s32, f32, aarch64_neon_vcvtfp2fxs_n, VectorRet | Add1ArgType),
-  SISDMAP1(vcvts_n_u32, f32, aarch64_neon_vcvtfp2fxu_n, VectorRet | Add1ArgType),
-  SISDMAP1(vcvts_s32, f32, aarch64_neon_fcvtzs, VectorRet | Add1ArgType),
-  SISDMAP1(vcvts_u32, f32, aarch64_neon_fcvtzu, VectorRet | Add1ArgType),
-  SISDMAP1(vcvtxd_f32, f64, aarch64_neon_fcvtxn, 0),
-  SISDMAP0(vdupb_lane, i8),
-  SISDMAP0(vdupb_laneq, i8),
-  SISDMAP0(vdupd_lane, f64),
-  SISDMAP0(vdupd_lane, i64),
-  SISDMAP0(vdupd_laneq, f64),
-  SISDMAP0(vdupd_laneq, i64),
-  SISDMAP0(vduph_lane, i16),
-  SISDMAP0(vduph_laneq, i16),
-  SISDMAP0(vdups_lane, f32),
-  SISDMAP0(vdups_lane, i32),
-  SISDMAP0(vdups_laneq, f32),
-  SISDMAP0(vdups_laneq, i32),
-  SISDMAP0(vfmad_lane, f64),
-  SISDMAP0(vfmad_laneq, f64),
-  SISDMAP0(vfmas_lane, f32),
-  SISDMAP0(vfmas_laneq, f32),
-  SISDMAP0(vget_lane, f32),
-  SISDMAP0(vget_lane, f64),
-  SISDMAP0(vget_lane, i16),
-  SISDMAP0(vget_lane, i32),
-  SISDMAP0(vget_lane, i64),
-  SISDMAP0(vget_lane, i8),
-  SISDMAP0(vgetq_lane, f32),
-  SISDMAP0(vgetq_lane, f64),
-  SISDMAP0(vgetq_lane, i16),
-  SISDMAP0(vgetq_lane, i32),
-  SISDMAP0(vgetq_lane, i64),
-  SISDMAP0(vgetq_lane, i8),
-  SISDMAP1(vmaxnmv, f32, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
-  SISDMAP1(vmaxnmvq, f32, aarch64_neon_vmaxnmv, 0),
-  SISDMAP1(vmaxnmvq, f64, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
-  SISDMAP1(vmaxv, f32, aarch64_neon_vpmax, AddRetType | Add1ArgType),
-  SISDMAP1(vmaxv, s16, aarch64_neon_smaxv, VectorRet | Add1ArgType),
-  SISDMAP1(vmaxv, s32, aarch64_neon_smaxv, VectorRet | Add1ArgType),
-  SISDMAP1(vmaxv, s8, aarch64_neon_smaxv, VectorRet | Add1ArgType),
-  SISDMAP1(vmaxv, u16, aarch64_neon_umaxv, VectorRet | Add1ArgType),
-  SISDMAP1(vmaxv, u32, aarch64_neon_umaxv, VectorRet | Add1ArgType),
-  SISDMAP1(vmaxv, u8, aarch64_neon_umaxv, VectorRet | Add1ArgType),
-  SISDMAP1(vmaxvq, f32, aarch64_neon_vmaxv, 0),
-  SISDMAP1(vmaxvq, f64, aarch64_neon_vpmax, AddRetType | Add1ArgType),
-  SISDMAP1(vmaxvq, s16, aarch64_neon_smaxv, VectorRet | Add1ArgType),
-  SISDMAP1(vmaxvq, s32, aarch64_neon_smaxv, VectorRet | Add1ArgType),
-  SISDMAP1(vmaxvq, s8, aarch64_neon_smaxv, VectorRet | Add1ArgType),
-  SISDMAP1(vmaxvq, u16, aarch64_neon_umaxv, VectorRet | Add1ArgType),
-  SISDMAP1(vmaxvq, u32, aarch64_neon_umaxv, VectorRet | Add1ArgType),
-  SISDMAP1(vmaxvq, u8, aarch64_neon_umaxv, VectorRet | Add1ArgType),
-  SISDMAP1(vminnmv, f32, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
-  SISDMAP1(vminnmvq, f32, aarch64_neon_vminnmv, 0),
-  SISDMAP1(vminnmvq, f64, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
-  SISDMAP1(vminv, f32, aarch64_neon_vpmin, AddRetType | Add1ArgType),
-  SISDMAP1(vminv, s16, aarch64_neon_sminv, VectorRet | Add1ArgType),
-  SISDMAP1(vminv, s32, aarch64_neon_sminv, VectorRet | Add1ArgType),
-  SISDMAP1(vminv, s8, aarch64_neon_sminv, VectorRet | Add1ArgType),
-  SISDMAP1(vminv, u16, aarch64_neon_uminv, VectorRet | Add1ArgType),
-  SISDMAP1(vminv, u32, aarch64_neon_uminv, VectorRet | Add1ArgType),
-  SISDMAP1(vminv, u8, aarch64_neon_uminv, VectorRet | Add1ArgType),
-  SISDMAP1(vminvq, f32, aarch64_neon_vminv, 0),
-  SISDMAP1(vminvq, f64, aarch64_neon_vpmin, AddRetType | Add1ArgType),
-  SISDMAP1(vminvq, s16, aarch64_neon_sminv, VectorRet | Add1ArgType),
-  SISDMAP1(vminvq, s32, aarch64_neon_sminv, VectorRet | Add1ArgType),
-  SISDMAP1(vminvq, s8, aarch64_neon_sminv, VectorRet | Add1ArgType),
-  SISDMAP1(vminvq, u16, aarch64_neon_uminv, VectorRet | Add1ArgType),
-  SISDMAP1(vminvq, u32, aarch64_neon_uminv, VectorRet | Add1ArgType),
-  SISDMAP1(vminvq, u8, aarch64_neon_uminv, VectorRet | Add1ArgType),
-  SISDMAP0(vmul_n, f64),
-  SISDMAP1(vmull, p64, aarch64_neon_vmull_p64, 0),
-  SISDMAP0(vmulxd, f64),
-  SISDMAP0(vmulxs, f32),
-  SISDMAP1(vnegd, s64, aarch64_neon_vneg, 0),
-  SISDMAP1(vpaddd, f64, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
-  SISDMAP1(vpaddd, s64, aarch64_neon_vpadd, 0),
-  SISDMAP1(vpaddd, u64, aarch64_neon_vpadd, 0),
-  SISDMAP1(vpadds, f32, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
-  SISDMAP1(vpmaxnmqd, f64, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
-  SISDMAP1(vpmaxnms, f32, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
-  SISDMAP1(vpmaxqd, f64, aarch64_neon_vpmax, AddRetType | Add1ArgType),
-  SISDMAP1(vpmaxs, f32, aarch64_neon_vpmax, AddRetType | Add1ArgType),
-  SISDMAP1(vpminnmqd, f64, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
-  SISDMAP1(vpminnms, f32, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
-  SISDMAP1(vpminqd, f64, aarch64_neon_vpmin, AddRetType | Add1ArgType),
-  SISDMAP1(vpmins, f32, aarch64_neon_vpmin, AddRetType | Add1ArgType),
-  SISDMAP1(vqabsb, s8, arm_neon_vqabs, VectorRet),
-  SISDMAP1(vqabsd, s64, arm_neon_vqabs, VectorRet),
-  SISDMAP1(vqabsh, s16, arm_neon_vqabs, VectorRet),
-  SISDMAP1(vqabss, s32, arm_neon_vqabs, VectorRet),
-  SISDMAP1(vqaddb, s8, arm_neon_vqadds, VectorRet),
-  SISDMAP1(vqaddb, u8, arm_neon_vqaddu, VectorRet),
-  SISDMAP1(vqaddd, s64, arm_neon_vqadds, VectorRet),
-  SISDMAP1(vqaddd, u64, arm_neon_vqaddu, VectorRet),
-  SISDMAP1(vqaddh, s16, arm_neon_vqadds, VectorRet),
-  SISDMAP1(vqaddh, u16, arm_neon_vqaddu, VectorRet),
-  SISDMAP1(vqadds, s32, arm_neon_vqadds, VectorRet),
-  SISDMAP1(vqadds, u32, arm_neon_vqaddu, VectorRet),
-  SISDMAP0(vqdmlalh_lane, s16),
-  SISDMAP0(vqdmlalh_laneq, s16),
-  SISDMAP1(vqdmlalh, s16, aarch64_neon_vqdmlal, VectorRet),
-  SISDMAP0(vqdmlals_lane, s32),
-  SISDMAP0(vqdmlals_laneq, s32),
-  SISDMAP1(vqdmlals, s32, aarch64_neon_vqdmlal, VectorRet),
-  SISDMAP0(vqdmlslh_lane, s16),
-  SISDMAP0(vqdmlslh_laneq, s16),
-  SISDMAP1(vqdmlslh, s16, aarch64_neon_vqdmlsl, VectorRet),
-  SISDMAP0(vqdmlsls_lane, s32),
-  SISDMAP0(vqdmlsls_laneq, s32),
-  SISDMAP1(vqdmlsls, s32, aarch64_neon_vqdmlsl, VectorRet),
-  SISDMAP1(vqdmulhh, s16, arm_neon_vqdmulh, VectorRet),
-  SISDMAP1(vqdmulhs, s32, arm_neon_vqdmulh, VectorRet),
-  SISDMAP1(vqdmullh, s16, arm_neon_vqdmull, VectorRet),
-  SISDMAP1(vqdmulls, s32, arm_neon_vqdmull, VectorRet),
-  SISDMAP1(vqmovnd, s64, arm_neon_vqmovns, VectorRet),
-  SISDMAP1(vqmovnd, u64, arm_neon_vqmovnu, VectorRet),
-  SISDMAP1(vqmovnh, s16, arm_neon_vqmovns, VectorRet),
-  SISDMAP1(vqmovnh, u16, arm_neon_vqmovnu, VectorRet),
-  SISDMAP1(vqmovns, s32, arm_neon_vqmovns, VectorRet),
-  SISDMAP1(vqmovns, u32, arm_neon_vqmovnu, VectorRet),
-  SISDMAP1(vqmovund, s64, arm_neon_vqmovnsu, VectorRet),
-  SISDMAP1(vqmovunh, s16, arm_neon_vqmovnsu, VectorRet),
-  SISDMAP1(vqmovuns, s32, arm_neon_vqmovnsu, VectorRet),
-  SISDMAP1(vqnegb, s8, arm_neon_vqneg, VectorRet),
-  SISDMAP1(vqnegd, s64, arm_neon_vqneg, VectorRet),
-  SISDMAP1(vqnegh, s16, arm_neon_vqneg, VectorRet),
-  SISDMAP1(vqnegs, s32, arm_neon_vqneg, VectorRet),
-  SISDMAP1(vqrdmulhh, s16, arm_neon_vqrdmulh, VectorRet),
-  SISDMAP1(vqrdmulhs, s32, arm_neon_vqrdmulh, VectorRet),
-  SISDMAP1(vqrshlb, s8, aarch64_neon_vqrshls, VectorRet),
-  SISDMAP1(vqrshlb, u8, aarch64_neon_vqrshlu, VectorRet),
-  SISDMAP1(vqrshld, s64, aarch64_neon_vqrshls, VectorRet),
-  SISDMAP1(vqrshld, u64, aarch64_neon_vqrshlu, VectorRet),
-  SISDMAP1(vqrshlh, s16, aarch64_neon_vqrshls, VectorRet),
-  SISDMAP1(vqrshlh, u16, aarch64_neon_vqrshlu, VectorRet),
-  SISDMAP1(vqrshls, s32, aarch64_neon_vqrshls, VectorRet),
-  SISDMAP1(vqrshls, u32, aarch64_neon_vqrshlu, VectorRet),
-  SISDMAP1(vqrshrnd_n, s64, aarch64_neon_vsqrshrn, VectorRet),
-  SISDMAP1(vqrshrnd_n, u64, aarch64_neon_vuqrshrn, VectorRet),
-  SISDMAP1(vqrshrnh_n, s16, aarch64_neon_vsqrshrn, VectorRet),
-  SISDMAP1(vqrshrnh_n, u16, aarch64_neon_vuqrshrn, VectorRet),
-  SISDMAP1(vqrshrns_n, s32, aarch64_neon_vsqrshrn, VectorRet),
-  SISDMAP1(vqrshrns_n, u32, aarch64_neon_vuqrshrn, VectorRet),
-  SISDMAP1(vqrshrund_n, s64, aarch64_neon_vsqrshrun, VectorRet),
-  SISDMAP1(vqrshrunh_n, s16, aarch64_neon_vsqrshrun, VectorRet),
-  SISDMAP1(vqrshruns_n, s32, aarch64_neon_vsqrshrun, VectorRet),
-  SISDMAP1(vqshlb_n, s8, aarch64_neon_vqshls_n, VectorRet),
-  SISDMAP1(vqshlb_n, u8, aarch64_neon_vqshlu_n, VectorRet),
-  SISDMAP1(vqshlb, s8, aarch64_neon_vqshls, VectorRet),
-  SISDMAP1(vqshlb, u8, aarch64_neon_vqshlu, VectorRet),
-  SISDMAP1(vqshld_n, s64, aarch64_neon_vqshls_n, VectorRet),
-  SISDMAP1(vqshld_n, u64, aarch64_neon_vqshlu_n, VectorRet),
-  SISDMAP1(vqshld, s64, aarch64_neon_vqshls, VectorRet),
-  SISDMAP1(vqshld, u64, aarch64_neon_vqshlu, VectorRet),
-  SISDMAP1(vqshlh_n, s16, aarch64_neon_vqshls_n, VectorRet),
-  SISDMAP1(vqshlh_n, u16, aarch64_neon_vqshlu_n, VectorRet),
-  SISDMAP1(vqshlh, s16, aarch64_neon_vqshls, VectorRet),
-  SISDMAP1(vqshlh, u16, aarch64_neon_vqshlu, VectorRet),
-  SISDMAP1(vqshls_n, s32, aarch64_neon_vqshls_n, VectorRet),
-  SISDMAP1(vqshls_n, u32, aarch64_neon_vqshlu_n, VectorRet),
-  SISDMAP1(vqshls, s32, aarch64_neon_vqshls, VectorRet),
-  SISDMAP1(vqshls, u32, aarch64_neon_vqshlu, VectorRet),
-  SISDMAP1(vqshlub_n, s8, aarch64_neon_vsqshlu, VectorRet),
-  SISDMAP1(vqshlud_n, s64, aarch64_neon_vsqshlu, VectorRet),
-  SISDMAP1(vqshluh_n, s16, aarch64_neon_vsqshlu, VectorRet),
-  SISDMAP1(vqshlus_n, s32, aarch64_neon_vsqshlu, VectorRet),
-  SISDMAP1(vqshrnd_n, s64, aarch64_neon_vsqshrn, VectorRet),
-  SISDMAP1(vqshrnd_n, u64, aarch64_neon_vuqshrn, VectorRet),
-  SISDMAP1(vqshrnh_n, s16, aarch64_neon_vsqshrn, VectorRet),
-  SISDMAP1(vqshrnh_n, u16, aarch64_neon_vuqshrn, VectorRet),
-  SISDMAP1(vqshrns_n, s32, aarch64_neon_vsqshrn, VectorRet),
-  SISDMAP1(vqshrns_n, u32, aarch64_neon_vuqshrn, VectorRet),
-  SISDMAP1(vqshrund_n, s64, aarch64_neon_vsqshrun, VectorRet),
-  SISDMAP1(vqshrunh_n, s16, aarch64_neon_vsqshrun, VectorRet),
-  SISDMAP1(vqshruns_n, s32, aarch64_neon_vsqshrun, VectorRet),
-  SISDMAP1(vqsubb, s8, arm_neon_vqsubs, VectorRet),
-  SISDMAP1(vqsubb, u8, arm_neon_vqsubu, VectorRet),
-  SISDMAP1(vqsubd, s64, arm_neon_vqsubs, VectorRet),
-  SISDMAP1(vqsubd, u64, arm_neon_vqsubu, VectorRet),
-  SISDMAP1(vqsubh, s16, arm_neon_vqsubs, VectorRet),
-  SISDMAP1(vqsubh, u16, arm_neon_vqsubu, VectorRet),
-  SISDMAP1(vqsubs, s32, arm_neon_vqsubs, VectorRet),
-  SISDMAP1(vqsubs, u32, arm_neon_vqsubu, VectorRet),
-  SISDMAP1(vrecped, f64, aarch64_neon_vrecpe, AddRetType),
-  SISDMAP1(vrecpes, f32, aarch64_neon_vrecpe, AddRetType),
-  SISDMAP1(vrecpsd, f64, aarch64_neon_vrecps, AddRetType),
-  SISDMAP1(vrecpss, f32, aarch64_neon_vrecps, AddRetType),
-  SISDMAP1(vrecpxd, f64, aarch64_neon_vrecpx, AddRetType),
-  SISDMAP1(vrecpxs, f32, aarch64_neon_vrecpx, AddRetType),
-  SISDMAP1(vrshld, s64, aarch64_neon_vrshlds, 0),
-  SISDMAP1(vrshld, u64, aarch64_neon_vrshldu, 0),
-  SISDMAP1(vrshrd_n, s64, aarch64_neon_vsrshr, VectorRet),
-  SISDMAP1(vrshrd_n, u64, aarch64_neon_vurshr, VectorRet),
-  SISDMAP1(vrsqrted, f64, aarch64_neon_vrsqrte, AddRetType),
-  SISDMAP1(vrsqrtes, f32, aarch64_neon_vrsqrte, AddRetType),
-  SISDMAP1(vrsqrtsd, f64, aarch64_neon_vrsqrts, AddRetType),
-  SISDMAP1(vrsqrtss, f32, aarch64_neon_vrsqrts, AddRetType),
-  SISDMAP1(vrsrad_n, s64, aarch64_neon_vrsrads_n, 0),
-  SISDMAP1(vrsrad_n, u64, aarch64_neon_vrsradu_n, 0),
-  SISDMAP0(vset_lane, f32),
-  SISDMAP0(vset_lane, f64),
-  SISDMAP0(vset_lane, i16),
-  SISDMAP0(vset_lane, i32),
-  SISDMAP0(vset_lane, i64),
-  SISDMAP0(vset_lane, i8),
-  SISDMAP0(vsetq_lane, f32),
-  SISDMAP0(vsetq_lane, f64),
-  SISDMAP0(vsetq_lane, i16),
-  SISDMAP0(vsetq_lane, i32),
-  SISDMAP0(vsetq_lane, i64),
-  SISDMAP0(vsetq_lane, i8),
-  SISDMAP1(vsha1cq, u32, arm_neon_sha1c, 0),
-  SISDMAP1(vsha1h, u32, arm_neon_sha1h, 0),
-  SISDMAP1(vsha1mq, u32, arm_neon_sha1m, 0),
-  SISDMAP1(vsha1pq, u32, arm_neon_sha1p, 0),
-  SISDMAP1(vshld_n, s64, aarch64_neon_vshld_n, 0),
-  SISDMAP1(vshld_n, u64, aarch64_neon_vshld_n, 0),
-  SISDMAP1(vshld, s64, aarch64_neon_vshlds, 0),
-  SISDMAP1(vshld, u64, aarch64_neon_vshldu, 0),
-  SISDMAP1(vshrd_n, s64, aarch64_neon_vshrds_n, 0),
-  SISDMAP1(vshrd_n, u64, aarch64_neon_vshrdu_n, 0),
-  SISDMAP1(vslid_n, s64, aarch64_neon_vsli, VectorRet),
-  SISDMAP1(vslid_n, u64, aarch64_neon_vsli, VectorRet),
-  SISDMAP1(vsqaddb, u8, aarch64_neon_vsqadd, VectorRet),
-  SISDMAP1(vsqaddd, u64, aarch64_neon_vsqadd, VectorRet),
-  SISDMAP1(vsqaddh, u16, aarch64_neon_vsqadd, VectorRet),
-  SISDMAP1(vsqadds, u32, aarch64_neon_vsqadd, VectorRet),
-  SISDMAP1(vsrad_n, s64, aarch64_neon_vsrads_n, 0),
-  SISDMAP1(vsrad_n, u64, aarch64_neon_vsradu_n, 0),
-  SISDMAP1(vsrid_n, s64, aarch64_neon_vsri, VectorRet),
-  SISDMAP1(vsrid_n, u64, aarch64_neon_vsri, VectorRet),
-  SISDMAP1(vsubd, s64, aarch64_neon_vsubds, 0),
-  SISDMAP1(vsubd, u64, aarch64_neon_vsubdu, 0),
-  SISDMAP1(vtstd, s64, aarch64_neon_vtstd, VectorRetGetArgs01),
-  SISDMAP1(vtstd, u64, aarch64_neon_vtstd, VectorRetGetArgs01),
-  SISDMAP1(vuqaddb, s8, aarch64_neon_vuqadd, VectorRet),
-  SISDMAP1(vuqaddd, s64, aarch64_neon_vuqadd, VectorRet),
-  SISDMAP1(vuqaddh, s16, aarch64_neon_vuqadd, VectorRet),
-  SISDMAP1(vuqadds, s32, aarch64_neon_vuqadd, VectorRet),
-};
-
-#undef SISDMAP0
-#undef SISDMAP1
-
-#ifndef NDEBUG
-static bool AArch64SISDIntrinsicInfoProvenSorted = false;
-#endif
-
-Function *CodeGenFunction::LookupNeonIntrinsic(unsigned IntrinsicID,
-                                               unsigned Modifier,
-                                               llvm::Type *ArgType,
-                                               const CallExpr *E) {
-  // Return type.
-  SmallVector<llvm::Type *, 3> Tys;
-  if (Modifier & AddRetType) {
-    llvm::Type *Ty = ConvertType(E->getCallReturnType());
-    if (Modifier & VectorizeRetType)
-      Ty = llvm::VectorType::get(Ty, 1);
-
-    Tys.push_back(Ty);
-  }
-
-  // Arguments.
-  if (Modifier & VectorizeArgTypes)
-    ArgType = llvm::VectorType::get(ArgType, 1);
-
-  if (Modifier & (Add1ArgType | Add2ArgTypes))
-    Tys.push_back(ArgType);
-
-  if (Modifier & Add2ArgTypes)
-    Tys.push_back(ArgType);
-
-  if (Modifier & InventFloatType)
-    Tys.push_back(FloatTy);
-
-  return CGM.getIntrinsic(IntrinsicID, Tys);
-}
-
-static Value *
-EmitAArch64ScalarBuiltinExpr(CodeGenFunction &CGF,
-                             const NeonSISDIntrinsicInfo &SISDInfo,
-                             const CallExpr *E) {
-  unsigned BuiltinID = SISDInfo.BuiltinID;
-  unsigned int Int = SISDInfo.LLVMIntrinsic;
-  unsigned IntTypes = SISDInfo.TypeModifier;
-  const char *s = SISDInfo.NameHint;
-
-  SmallVector<Value *, 4> Ops;
-  for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
-    Ops.push_back(CGF.EmitScalarExpr(E->getArg(i)));
-  }
-
-  // AArch64 scalar builtins are not overloaded, they do not have an extra
-  // argument that specifies the vector type, need to handle each case.
-  switch (BuiltinID) {
-  default: break;
-  case NEON::BI__builtin_neon_vdups_lane_f32:
-  case NEON::BI__builtin_neon_vdupd_lane_f64:
-  case NEON::BI__builtin_neon_vdups_laneq_f32:
-  case NEON::BI__builtin_neon_vdupd_laneq_f64: {
-    return CGF.Builder.CreateExtractElement(Ops[0], Ops[1], "vdup_lane");
-  }
-  case NEON::BI__builtin_neon_vdupb_lane_i8:
-  case NEON::BI__builtin_neon_vduph_lane_i16:
-  case NEON::BI__builtin_neon_vdups_lane_i32:
-  case NEON::BI__builtin_neon_vdupd_lane_i64:
-  case NEON::BI__builtin_neon_vdupb_laneq_i8:
-  case NEON::BI__builtin_neon_vduph_laneq_i16:
-  case NEON::BI__builtin_neon_vdups_laneq_i32:
-  case NEON::BI__builtin_neon_vdupd_laneq_i64: {
-    // The backend treats Neon scalar types as v1ix types
-    // So we want to dup lane from any vector to v1ix vector
-    // with shufflevector
-    s = "vdup_lane";
-    Value* SV = llvm::ConstantVector::getSplat(1, cast<ConstantInt>(Ops[1]));
-    Value *Result = CGF.Builder.CreateShuffleVector(Ops[0], Ops[0], SV, s);
-    llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
-    // AArch64 intrinsic one-element vector type cast to
-    // scalar type expected by the builtin
-    return CGF.Builder.CreateBitCast(Result, Ty, s);
-  }
-  case NEON::BI__builtin_neon_vqdmlalh_lane_s16 :
-  case NEON::BI__builtin_neon_vqdmlalh_laneq_s16 :
-  case NEON::BI__builtin_neon_vqdmlals_lane_s32 :
-  case NEON::BI__builtin_neon_vqdmlals_laneq_s32 :
-  case NEON::BI__builtin_neon_vqdmlslh_lane_s16 :
-  case NEON::BI__builtin_neon_vqdmlslh_laneq_s16 :
-  case NEON::BI__builtin_neon_vqdmlsls_lane_s32 :
-  case NEON::BI__builtin_neon_vqdmlsls_laneq_s32 : {
-    Int = Intrinsic::arm_neon_vqadds;
-    if (BuiltinID == NEON::BI__builtin_neon_vqdmlslh_lane_s16 ||
-        BuiltinID == NEON::BI__builtin_neon_vqdmlslh_laneq_s16 ||
-        BuiltinID == NEON::BI__builtin_neon_vqdmlsls_lane_s32 ||
-        BuiltinID == NEON::BI__builtin_neon_vqdmlsls_laneq_s32) {
-      Int = Intrinsic::arm_neon_vqsubs;
-    }
-    // create vqdmull call with b * c[i]
-    llvm::Type *Ty = CGF.ConvertType(E->getArg(1)->getType());
-    llvm::VectorType *OpVTy = llvm::VectorType::get(Ty, 1);
-    Ty = CGF.ConvertType(E->getArg(0)->getType());
-    llvm::VectorType *ResVTy = llvm::VectorType::get(Ty, 1);
-    Value *F = CGF.CGM.getIntrinsic(Intrinsic::arm_neon_vqdmull, ResVTy);
-    Value *V = UndefValue::get(OpVTy);
-    llvm::Constant *CI = ConstantInt::get(CGF.Int32Ty, 0);
-    SmallVector<Value *, 2> MulOps;
-    MulOps.push_back(Ops[1]);
-    MulOps.push_back(Ops[2]);
-    MulOps[0] = CGF.Builder.CreateInsertElement(V, MulOps[0], CI);
-    MulOps[1] = CGF.Builder.CreateExtractElement(MulOps[1], Ops[3], "extract");
-    MulOps[1] = CGF.Builder.CreateInsertElement(V, MulOps[1], CI);
-    Value *MulRes = CGF.Builder.CreateCall2(F, MulOps[0], MulOps[1]);
-    // create vqadds call with a +/- vqdmull result
-    F = CGF.CGM.getIntrinsic(Int, ResVTy);
-    SmallVector<Value *, 2> AddOps;
-    AddOps.push_back(Ops[0]);
-    AddOps.push_back(MulRes);
-    V = UndefValue::get(ResVTy);
-    AddOps[0] = CGF.Builder.CreateInsertElement(V, AddOps[0], CI);
-    Value *AddRes = CGF.Builder.CreateCall2(F, AddOps[0], AddOps[1]);
-    return CGF.Builder.CreateBitCast(AddRes, Ty);
-  }
-  case NEON::BI__builtin_neon_vfmas_lane_f32:
-  case NEON::BI__builtin_neon_vfmas_laneq_f32:
-  case NEON::BI__builtin_neon_vfmad_lane_f64:
-  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
-    llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
-    Value *F = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
-    Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
-    return CGF.Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
-  }
-  // Scalar Floating-point Multiply Extended
-  case NEON::BI__builtin_neon_vmulxs_f32:
-  case NEON::BI__builtin_neon_vmulxd_f64: {
-    Int = Intrinsic::aarch64_neon_vmulx;
-    llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
-    return CGF.EmitNeonCall(CGF.CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
-  }
-  case NEON::BI__builtin_neon_vmul_n_f64: {
-    // v1f64 vmul_n_f64  should be mapped to Neon scalar mul lane
-    llvm::Type *VTy = GetNeonType(&CGF,
-      NeonTypeFlags(NeonTypeFlags::Float64, false, false));
-    Ops[0] = CGF.Builder.CreateBitCast(Ops[0], VTy);
-    llvm::Value *Idx = llvm::ConstantInt::get(CGF.Int32Ty, 0);
-    Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], Idx, "extract");
-    Value *Result = CGF.Builder.CreateFMul(Ops[0], Ops[1]);
-    return CGF.Builder.CreateBitCast(Result, VTy);
-  }
-  case NEON::BI__builtin_neon_vget_lane_i8:
-  case NEON::BI__builtin_neon_vget_lane_i16:
-  case NEON::BI__builtin_neon_vget_lane_i32:
-  case NEON::BI__builtin_neon_vget_lane_i64:
-  case NEON::BI__builtin_neon_vget_lane_f32:
-  case NEON::BI__builtin_neon_vget_lane_f64:
-  case NEON::BI__builtin_neon_vgetq_lane_i8:
-  case NEON::BI__builtin_neon_vgetq_lane_i16:
-  case NEON::BI__builtin_neon_vgetq_lane_i32:
-  case NEON::BI__builtin_neon_vgetq_lane_i64:
-  case NEON::BI__builtin_neon_vgetq_lane_f32:
-  case NEON::BI__builtin_neon_vgetq_lane_f64:
-    return CGF.EmitARMBuiltinExpr(NEON::BI__builtin_neon_vget_lane_i8, E);
-  case NEON::BI__builtin_neon_vset_lane_i8:
-  case NEON::BI__builtin_neon_vset_lane_i16:
-  case NEON::BI__builtin_neon_vset_lane_i32:
-  case NEON::BI__builtin_neon_vset_lane_i64:
-  case NEON::BI__builtin_neon_vset_lane_f32:
-  case NEON::BI__builtin_neon_vset_lane_f64:
-  case NEON::BI__builtin_neon_vsetq_lane_i8:
-  case NEON::BI__builtin_neon_vsetq_lane_i16:
-  case NEON::BI__builtin_neon_vsetq_lane_i32:
-  case NEON::BI__builtin_neon_vsetq_lane_i64:
-  case NEON::BI__builtin_neon_vsetq_lane_f32:
-  case NEON::BI__builtin_neon_vsetq_lane_f64:
-    return CGF.EmitARMBuiltinExpr(NEON::BI__builtin_neon_vset_lane_i8, E);
-
-  case NEON::BI__builtin_neon_vcled_s64:
-  case NEON::BI__builtin_neon_vcled_u64:
-  case NEON::BI__builtin_neon_vcles_f32:
-  case NEON::BI__builtin_neon_vcled_f64:
-  case NEON::BI__builtin_neon_vcltd_s64:
-  case NEON::BI__builtin_neon_vcltd_u64:
-  case NEON::BI__builtin_neon_vclts_f32:
-  case NEON::BI__builtin_neon_vcltd_f64:
-  case NEON::BI__builtin_neon_vcales_f32:
-  case NEON::BI__builtin_neon_vcaled_f64:
-  case NEON::BI__builtin_neon_vcalts_f32:
-  case NEON::BI__builtin_neon_vcaltd_f64:
-    // Only one direction of comparisons actually exist, cmle is actually a cmge
-    // with swapped operands. The table gives us the right intrinsic but we
-    // still need to do the swap.
-    std::swap(Ops[0], Ops[1]);
-    break;
-  case NEON::BI__builtin_neon_vceqzd_s64:
-  case NEON::BI__builtin_neon_vceqzd_u64:
-  case NEON::BI__builtin_neon_vcgezd_s64:
-  case NEON::BI__builtin_neon_vcgtzd_s64:
-  case NEON::BI__builtin_neon_vclezd_s64:
-  case NEON::BI__builtin_neon_vcltzd_s64:
-    // Add implicit zero operand.
-    Ops.push_back(llvm::Constant::getNullValue(Ops[0]->getType()));
-    break;
-  case NEON::BI__builtin_neon_vceqzs_f32:
-  case NEON::BI__builtin_neon_vceqzd_f64:
-  case NEON::BI__builtin_neon_vcgezs_f32:
-  case NEON::BI__builtin_neon_vcgezd_f64:
-  case NEON::BI__builtin_neon_vcgtzs_f32:
-  case NEON::BI__builtin_neon_vcgtzd_f64:
-  case NEON::BI__builtin_neon_vclezs_f32:
-  case NEON::BI__builtin_neon_vclezd_f64:
-  case NEON::BI__builtin_neon_vcltzs_f32:
-  case NEON::BI__builtin_neon_vcltzd_f64:
-    // Add implicit zero operand.
-    Ops.push_back(llvm::Constant::getNullValue(CGF.FloatTy));
-    break;
-  }
-
-  if (!Int)
-    return 0;
+  assert(Int && "Expected valid intrinsic number");
 
   // Determine the type(s) of this overloaded AArch64 intrinsic.
-  const Expr *Arg = E->getArg(0);
-  llvm::Type *ArgTy = CGF.ConvertType(Arg->getType());
-  Function *F = CGF.LookupNeonIntrinsic(Int, IntTypes, ArgTy, E);
+  Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E);
 
-  Value *Result = CGF.EmitNeonCall(F, Ops, s);
-  llvm::Type *ResultType = CGF.ConvertType(E->getType());
+  Value *Result = EmitNeonCall(F, Ops, NameHint);
+  llvm::Type *ResultType = ConvertType(E->getType());
   // AArch64 intrinsic one-element vector type cast to
   // scalar type expected by the builtin
-  return CGF.Builder.CreateBitCast(Result, ResultType, s);
+  return Builder.CreateBitCast(Result, ResultType, NameHint);
 }
 
 Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr(
@@ -3304,21 +3277,12 @@ Value *CodeGenFunction::EmitAArch64Built
                                                const CallExpr *E) {
 
   // Process AArch64 scalar builtins
-  llvm::ArrayRef<NeonSISDIntrinsicInfo> SISDInfo(AArch64SISDIntrinsicInfo);
-#ifndef NDEBUG
-  if (!AArch64SISDIntrinsicInfoProvenSorted) {
-    // FIXME: use std::is_sorted once C++11 is allowed
-    for (unsigned i = 0; i < SISDInfo.size() - 1; ++i)
-      assert(SISDInfo[i].BuiltinID <= SISDInfo[i + 1].BuiltinID);
-    AArch64SISDIntrinsicInfoProvenSorted = true;
-  }
-#endif
-
-  const NeonSISDIntrinsicInfo *SISDIntrin =
-      std::lower_bound(SISDInfo.begin(), SISDInfo.end(), BuiltinID);
+  llvm::ArrayRef<NeonIntrinsicInfo> SISDInfo(AArch64SISDIntrinsicInfo);
+  const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap(
+      SISDInfo, BuiltinID, AArch64SISDIntrinsicInfoProvenSorted);
 
-  if (SISDIntrin != SISDInfo.end() && SISDIntrin->BuiltinID == BuiltinID) {
-    Value *Result = EmitAArch64ScalarBuiltinExpr(*this, *SISDIntrin, E);
+  if (Builtin) {
+    Value *Result = EmitAArch64ScalarBuiltinExpr(*this, *Builtin, E);
     assert(Result && "SISD intrinsic should have been handled");
     return Result;
   }
@@ -3461,10 +3425,16 @@ Value *CodeGenFunction::EmitAArch64Built
   if (!Ty)
     return 0;
 
+
   // Many NEON builtins have identical semantics and uses in ARM and
   // AArch64. Emit these in a single function.
-  if (Value *Result = EmitCommonNeonBuiltinExpr(BuiltinID, E, Ops, Align))
-    return Result;
+  llvm::ArrayRef<NeonIntrinsicInfo> IntrinsicMap(ARMSIMDIntrinsicMap);
+  Builtin = findNeonIntrinsicInMap(IntrinsicMap, BuiltinID,
+                                   NEONSIMDIntrinsicsProvenSorted);
+  if (Builtin)
+    return EmitCommonNeonBuiltinExpr(
+        Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
+        Builtin->NameHint, Builtin->TypeModifier, E, Ops, Align);
 
   unsigned Int;
   switch (BuiltinID) {
@@ -3814,10 +3784,6 @@ Value *CodeGenFunction::EmitAArch64Built
     Int = usgn ? Intrinsic::arm_neon_vpminu : Intrinsic::arm_neon_vpmins;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
   }
-  case NEON::BI__builtin_neon_vpaddq_v: {
-    Int = Intrinsic::arm_neon_vpadd;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpadd");
-  }
   case NEON::BI__builtin_neon_vmulx_v:
   case NEON::BI__builtin_neon_vmulxq_v: {
     Int = Intrinsic::aarch64_neon_vmulx;
@@ -4220,8 +4186,13 @@ Value *CodeGenFunction::EmitARMBuiltinEx
 
   // Many NEON builtins have identical semantics and uses in ARM and
   // AArch64. Emit these in a single function.
-  if (Value *Result = EmitCommonNeonBuiltinExpr(BuiltinID, E, Ops, Align))
-    return Result;
+  llvm::ArrayRef<NeonIntrinsicInfo> IntrinsicMap(ARMSIMDIntrinsicMap);
+  const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap(
+      IntrinsicMap, BuiltinID, NEONSIMDIntrinsicsProvenSorted);
+  if (Builtin)
+    return EmitCommonNeonBuiltinExpr(
+        Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
+        Builtin->NameHint, Builtin->TypeModifier, E, Ops, Align);
 
   unsigned Int;
   switch (BuiltinID) {

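The table lookups introduced above go through findNeonIntrinsicInMap together with the per-table ProvenSorted flags; their definitions land elsewhere in this patch and are not visible in these hunks. Judging from the call sites and from the inline lower_bound/assert code removed from EmitAArch64BuiltinExpr, a minimal sketch of such a helper (the exact signature and flag handling here are assumptions, not necessarily the committed body) is:

  static bool NEONSIMDIntrinsicsProvenSorted = false;
  static bool AArch64SISDIntrinsicInfoProvenSorted = false;

  static const NeonIntrinsicInfo *
  findNeonIntrinsicInMap(llvm::ArrayRef<NeonIntrinsicInfo> IntrinsicMap,
                         unsigned BuiltinID, bool &MapProvenSorted) {
  #ifndef NDEBUG
    if (!MapProvenSorted) {
      // One-time, debug-only check that the table is sorted by BuiltinID,
      // since the lookup below is a binary search.
      for (unsigned i = 0; i < IntrinsicMap.size() - 1; ++i)
        assert(IntrinsicMap[i].BuiltinID <= IntrinsicMap[i + 1].BuiltinID);
      MapProvenSorted = true;
    }
  #endif

    // NeonIntrinsicInfo::operator<(unsigned) compares an entry's BuiltinID
    // against the requested ID, so lower_bound can search the raw array.
    const NeonIntrinsicInfo *Builtin =
        std::lower_bound(IntrinsicMap.begin(), IntrinsicMap.end(), BuiltinID);

    if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID)
      return Builtin;

    // Not a table-driven builtin: the caller falls back to its own switch.
    return 0;
  }

Each call site in this revision then only has to test the returned pointer for null before dispatching to the shared emitter.
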
Modified: cfe/trunk/lib/CodeGen/CodeGenFunction.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CodeGenFunction.h?rev=201848&r1=201847&r2=201848&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CodeGenFunction.h (original)
+++ cfe/trunk/lib/CodeGen/CodeGenFunction.h Fri Feb 21 05:57:24 2014
@@ -2179,11 +2179,18 @@ public:
   llvm::Value *EmitAArch64CompareBuiltinExpr(llvm::Value *Op, llvm::Type *Ty);
   llvm::Value *EmitAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitARMBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
-  llvm::Value *EmitCommonNeonBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
+
+  llvm::Value *EmitCommonNeonBuiltinExpr(unsigned BuiltinID,
+                                         unsigned LLVMIntrinsic,
+                                         unsigned AltLLVMIntrinsic,
+                                         const char *NameHint,
+                                         unsigned Modifier,
+                                         const CallExpr *E,
                                          SmallVectorImpl<llvm::Value *> &Ops,
                                          llvm::Value *Align = 0);
-  llvm::Function *LookupNeonIntrinsic(unsigned IntrinsicID, unsigned Modifier,
-                                      llvm::Type *ArgTy, const CallExpr *E);
+  llvm::Function *LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
+                                          unsigned Modifier, llvm::Type *ArgTy,
+                                          const CallExpr *E);
   llvm::Value *EmitNeonCall(llvm::Function *F,
                             SmallVectorImpl<llvm::Value*> &O,
                             const char *name,
