[llvm-branch-commits] [clang] [CIR][AArch64] Add missing lowerings for vceqz_* Neon builtins (PR #184402)

Andrzej WarzyƄski via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Mar 5 08:17:01 PST 2026


https://github.com/banach-space updated https://github.com/llvm/llvm-project/pull/184402

>From aad35a51a63d8b7836e73d49bbaeb4e96739a30e Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Tue, 3 Mar 2026 16:38:13 +0000
Subject: [PATCH 1/2] [CIR][AArch64] Add missing lowerings for vceqz_* NEON
 builtins

Implement the remaining CIR lowerings for the AdvSIMD (NEON)
`vceqz{|q|d|s}_*` intrinsic group (bitwise equal to zero).

The `vceqzd_s64` variant was already supported; this patch completes
the rest of the group.

Tests for these intrinsics are moved from:
  test/CodeGen/AArch64/neon-misc.c
to:
  test/CodeGen/AArch64/neon/intrinsics.c

The implementation largely mirrors the existing lowering in
CodeGen/TargetBuiltins/ARM.cpp.

`emitCommonNeonBuiltinExpr` is introduced to support these lowerings.
`getNeonType` is moved without functional changes.

Reference:
https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#bitwise-equal-to-zero
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 794 ++++++++++++++++--
 clang/lib/CIR/FrontendAction/CIRGenAction.cpp |   2 +
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      |   6 +
 clang/test/CodeGen/AArch64/neon-misc.c        | 307 +------
 clang/test/CodeGen/AArch64/neon/intrinsics.c  | 378 ++++++++-
 5 files changed, 1105 insertions(+), 382 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index be825a5f2f234..d495b02eb7fbd 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -47,12 +47,34 @@ static mlir::Value genVscaleTimesFactor(mlir::Location loc,
 //
 // TODO(cir): Share this code with ARM.cpp
 //===----------------------------------------------------------------------===//
-static bool aarch64SVEIntrinsicsProvenSorted = false;
+enum {
+  AddRetType = (1 << 0),
+  Add1ArgType = (1 << 1),
+  Add2ArgTypes = (1 << 2),
+
+  VectorizeRetType = (1 << 3),
+  VectorizeArgTypes = (1 << 4),
+
+  InventFloatType = (1 << 5),
+  UnsignedAlts = (1 << 6),
+
+  Use64BitVectors = (1 << 7),
+  Use128BitVectors = (1 << 8),
+
+  Vectorize1ArgType = Add1ArgType | VectorizeArgTypes,
+  VectorRet = AddRetType | VectorizeRetType,
+  VectorRetGetArgs01 =
+      AddRetType | Add2ArgTypes | VectorizeRetType | VectorizeArgTypes,
+  FpCmpzModifiers =
+      AddRetType | VectorizeRetType | Add1ArgType | InventFloatType
+};
 
 namespace {
 struct ARMVectorIntrinsicInfo {
+  const char *nameHint;
   unsigned builtinID;
   unsigned llvmIntrinsic;
+  unsigned altLLVMIntrinsic;
   uint64_t typeModifier;
 
   bool operator<(unsigned rhsBuiltinID) const {
@@ -64,17 +86,344 @@ struct ARMVectorIntrinsicInfo {
 };
 } // end anonymous namespace
 
-#define SVEMAP1(NameBase, llvmIntrinsic, TypeModifier)                         \
-  {SVE::BI__builtin_sve_##NameBase, Intrinsic::llvmIntrinsic, TypeModifier}
+#define NEONMAP0(NameBase)                                                     \
+  {#NameBase, NEON::BI__builtin_neon_##NameBase, 0, 0, 0}
+
+#define NEONMAP1(NameBase, LLVMIntrinsic, TypeModifier)                        \
+  {#NameBase, NEON::BI__builtin_neon_##NameBase, Intrinsic::LLVMIntrinsic, 0,  \
+   TypeModifier}
+
+#define NEONMAP2(NameBase, LLVMIntrinsic, AltLLVMIntrinsic, TypeModifier)      \
+  {#NameBase, NEON::BI__builtin_neon_##NameBase, Intrinsic::LLVMIntrinsic,     \
+   Intrinsic::AltLLVMIntrinsic, TypeModifier}
+
+static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
+    NEONMAP0(splat_lane_v),
+    NEONMAP0(splat_laneq_v),
+    NEONMAP0(splatq_lane_v),
+    NEONMAP0(splatq_laneq_v),
+    NEONMAP1(vabs_v, aarch64_neon_abs, 0),
+    NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
+    NEONMAP0(vadd_v),
+    NEONMAP0(vaddhn_v),
+    NEONMAP0(vaddq_p128),
+    NEONMAP0(vaddq_v),
+    NEONMAP1(vaesdq_u8, aarch64_crypto_aesd, 0),
+    NEONMAP1(vaeseq_u8, aarch64_crypto_aese, 0),
+    NEONMAP1(vaesimcq_u8, aarch64_crypto_aesimc, 0),
+    NEONMAP1(vaesmcq_u8, aarch64_crypto_aesmc, 0),
+    NEONMAP2(vbcaxq_s16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vbcaxq_s32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vbcaxq_s64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vbcaxq_s8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vbcaxq_u16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vbcaxq_u32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vbcaxq_u64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vbcaxq_u8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP1(vbfdot_f32, aarch64_neon_bfdot, 0),
+    NEONMAP1(vbfdotq_f32, aarch64_neon_bfdot, 0),
+    NEONMAP1(vbfmlalbq_f32, aarch64_neon_bfmlalb, 0),
+    NEONMAP1(vbfmlaltq_f32, aarch64_neon_bfmlalt, 0),
+    NEONMAP1(vbfmmlaq_f32, aarch64_neon_bfmmla, 0),
+    NEONMAP1(vcadd_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
+    NEONMAP1(vcadd_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
+    NEONMAP1(vcadd_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
+    NEONMAP1(vcadd_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
+    NEONMAP1(vcaddq_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
+    NEONMAP1(vcaddq_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
+    NEONMAP1(vcaddq_rot270_f64, aarch64_neon_vcadd_rot270, Add1ArgType),
+    NEONMAP1(vcaddq_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
+    NEONMAP1(vcaddq_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
+    NEONMAP1(vcaddq_rot90_f64, aarch64_neon_vcadd_rot90, Add1ArgType),
+    NEONMAP1(vcage_v, aarch64_neon_facge, 0),
+    NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
+    NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
+    NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),
+    NEONMAP1(vcale_v, aarch64_neon_facge, 0),
+    NEONMAP1(vcaleq_v, aarch64_neon_facge, 0),
+    NEONMAP1(vcalt_v, aarch64_neon_facgt, 0),
+    NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0),
+    NEONMAP0(vceqz_v),
+    NEONMAP0(vceqzq_v),
+    NEONMAP0(vcgez_v),
+    NEONMAP0(vcgezq_v),
+    NEONMAP0(vcgtz_v),
+    NEONMAP0(vcgtzq_v),
+    NEONMAP0(vclez_v),
+    NEONMAP0(vclezq_v),
+    NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType),
+    NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType),
+    NEONMAP0(vcltz_v),
+    NEONMAP0(vcltzq_v),
+    NEONMAP1(vclz_v, ctlz, Add1ArgType),
+    NEONMAP1(vclzq_v, ctlz, Add1ArgType),
+    NEONMAP1(vcmla_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
+    NEONMAP1(vcmla_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
+    NEONMAP1(vcmla_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
+    NEONMAP1(vcmla_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
+    NEONMAP1(vcmla_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
+    NEONMAP1(vcmla_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
+    NEONMAP1(vcmla_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
+    NEONMAP1(vcmla_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
+    NEONMAP1(vcmlaq_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
+    NEONMAP1(vcmlaq_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
+    NEONMAP1(vcmlaq_f64, aarch64_neon_vcmla_rot0, Add1ArgType),
+    NEONMAP1(vcmlaq_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
+    NEONMAP1(vcmlaq_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
+    NEONMAP1(vcmlaq_rot180_f64, aarch64_neon_vcmla_rot180, Add1ArgType),
+    NEONMAP1(vcmlaq_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
+    NEONMAP1(vcmlaq_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
+    NEONMAP1(vcmlaq_rot270_f64, aarch64_neon_vcmla_rot270, Add1ArgType),
+    NEONMAP1(vcmlaq_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
+    NEONMAP1(vcmlaq_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
+    NEONMAP1(vcmlaq_rot90_f64, aarch64_neon_vcmla_rot90, Add1ArgType),
+    NEONMAP1(vcnt_v, ctpop, Add1ArgType),
+    NEONMAP1(vcntq_v, ctpop, Add1ArgType),
+    NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
+    NEONMAP0(vcvt_f16_s16),
+    NEONMAP0(vcvt_f16_u16),
+    NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0),
+    NEONMAP0(vcvt_f32_v),
+    NEONMAP1(vcvt_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
+    NEONMAP1(vcvt_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
+    NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
+    NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
+    NEONMAP1(vcvt_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
+    NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
+    NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
+    NEONMAP1(vcvt_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
+    NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
+    NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
+    NEONMAP0(vcvtq_f16_s16),
+    NEONMAP0(vcvtq_f16_u16),
+    NEONMAP0(vcvtq_f32_v),
+    NEONMAP0(vcvtq_high_bf16_f32),
+    NEONMAP0(vcvtq_low_bf16_f32),
+    NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
+    NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
+    NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp,
+             0),
+    NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp,
+             0),
+    NEONMAP1(vcvtq_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
+    NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
+    NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
+    NEONMAP1(vcvtq_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
+    NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
+    NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
+    NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
+    NEONMAP1(vdot_s32, aarch64_neon_sdot, 0),
+    NEONMAP1(vdot_u32, aarch64_neon_udot, 0),
+    NEONMAP1(vdotq_s32, aarch64_neon_sdot, 0),
+    NEONMAP1(vdotq_u32, aarch64_neon_udot, 0),
+    NEONMAP2(veor3q_s16, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(veor3q_s32, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(veor3q_s64, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(veor3q_s8, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(veor3q_u16, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(veor3q_u32, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(veor3q_u64, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(veor3q_u8, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP0(vext_v),
+    NEONMAP0(vextq_v),
+    NEONMAP0(vfma_v),
+    NEONMAP0(vfmaq_v),
+    NEONMAP1(vfmlal_high_f16, aarch64_neon_fmlal2, 0),
+    NEONMAP1(vfmlal_low_f16, aarch64_neon_fmlal, 0),
+    NEONMAP1(vfmlalq_high_f16, aarch64_neon_fmlal2, 0),
+    NEONMAP1(vfmlalq_low_f16, aarch64_neon_fmlal, 0),
+    NEONMAP1(vfmlsl_high_f16, aarch64_neon_fmlsl2, 0),
+    NEONMAP1(vfmlsl_low_f16, aarch64_neon_fmlsl, 0),
+    NEONMAP1(vfmlslq_high_f16, aarch64_neon_fmlsl2, 0),
+    NEONMAP1(vfmlslq_low_f16, aarch64_neon_fmlsl, 0),
+    NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP1(vld1_x2_v, aarch64_neon_ld1x2, 0),
+    NEONMAP1(vld1_x3_v, aarch64_neon_ld1x3, 0),
+    NEONMAP1(vld1_x4_v, aarch64_neon_ld1x4, 0),
+    NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
+    NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
+    NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
+    NEONMAP1(vmmlaq_s32, aarch64_neon_smmla, 0),
+    NEONMAP1(vmmlaq_u32, aarch64_neon_ummla, 0),
+    NEONMAP0(vmovl_v),
+    NEONMAP0(vmovn_v),
+    NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
+    NEONMAP1(vmulq_v, aarch64_neon_pmul, Add1ArgType),
+    NEONMAP1(vpadd_v, aarch64_neon_addp, Add1ArgType),
+    NEONMAP2(vpaddl_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
+    NEONMAP2(vpaddlq_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
+    NEONMAP1(vpaddq_v, aarch64_neon_addp, Add1ArgType),
+    NEONMAP1(vqabs_v, aarch64_neon_sqabs, Add1ArgType),
+    NEONMAP1(vqabsq_v, aarch64_neon_sqabs, Add1ArgType),
+    NEONMAP2(vqadd_v, aarch64_neon_uqadd, aarch64_neon_sqadd,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
+    NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
+    NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
+    NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
+    NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
+    NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
+    NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
+    NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
+    NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
+    NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
+    NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
+    NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
+    NEONMAP1(vqrdmlah_s16, aarch64_neon_sqrdmlah, Add1ArgType),
+    NEONMAP1(vqrdmlah_s32, aarch64_neon_sqrdmlah, Add1ArgType),
+    NEONMAP1(vqrdmlahq_s16, aarch64_neon_sqrdmlah, Add1ArgType),
+    NEONMAP1(vqrdmlahq_s32, aarch64_neon_sqrdmlah, Add1ArgType),
+    NEONMAP1(vqrdmlsh_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
+    NEONMAP1(vqrdmlsh_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
+    NEONMAP1(vqrdmlshq_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
+    NEONMAP1(vqrdmlshq_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
+    NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
+    NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
+    NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
+    NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
+    NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
+    NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
+    NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vqshl_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
+    NEONMAP2(vqshl_v, aarch64_neon_uqshl, aarch64_neon_sqshl,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vqshlq_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
+    NEONMAP2(vqshlq_v, aarch64_neon_uqshl, aarch64_neon_sqshl,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP1(vqshlu_n_v, aarch64_neon_sqshlu, 0),
+    NEONMAP1(vqshluq_n_v, aarch64_neon_sqshlu, 0),
+    NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType),
+    NEONMAP1(vrax1q_u64, aarch64_crypto_rax1, 0),
+    NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
+    NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
+    NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType),
+    NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType),
+    NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP1(vrnd32x_f32, aarch64_neon_frint32x, Add1ArgType),
+    NEONMAP1(vrnd32x_f64, aarch64_neon_frint32x, Add1ArgType),
+    NEONMAP1(vrnd32xq_f32, aarch64_neon_frint32x, Add1ArgType),
+    NEONMAP1(vrnd32xq_f64, aarch64_neon_frint32x, Add1ArgType),
+    NEONMAP1(vrnd32z_f32, aarch64_neon_frint32z, Add1ArgType),
+    NEONMAP1(vrnd32z_f64, aarch64_neon_frint32z, Add1ArgType),
+    NEONMAP1(vrnd32zq_f32, aarch64_neon_frint32z, Add1ArgType),
+    NEONMAP1(vrnd32zq_f64, aarch64_neon_frint32z, Add1ArgType),
+    NEONMAP1(vrnd64x_f32, aarch64_neon_frint64x, Add1ArgType),
+    NEONMAP1(vrnd64x_f64, aarch64_neon_frint64x, Add1ArgType),
+    NEONMAP1(vrnd64xq_f32, aarch64_neon_frint64x, Add1ArgType),
+    NEONMAP1(vrnd64xq_f64, aarch64_neon_frint64x, Add1ArgType),
+    NEONMAP1(vrnd64z_f32, aarch64_neon_frint64z, Add1ArgType),
+    NEONMAP1(vrnd64z_f64, aarch64_neon_frint64z, Add1ArgType),
+    NEONMAP1(vrnd64zq_f32, aarch64_neon_frint64z, Add1ArgType),
+    NEONMAP1(vrnd64zq_f64, aarch64_neon_frint64z, Add1ArgType),
+    NEONMAP0(vrndi_v),
+    NEONMAP0(vrndiq_v),
+    NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
+    NEONMAP2(vrshrq_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
+    NEONMAP2(vrsqrte_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
+    NEONMAP2(vrsqrteq_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
+    NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType),
+    NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType),
+    NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType),
+    NEONMAP1(vsha1su0q_u32, aarch64_crypto_sha1su0, 0),
+    NEONMAP1(vsha1su1q_u32, aarch64_crypto_sha1su1, 0),
+    NEONMAP1(vsha256h2q_u32, aarch64_crypto_sha256h2, 0),
+    NEONMAP1(vsha256hq_u32, aarch64_crypto_sha256h, 0),
+    NEONMAP1(vsha256su0q_u32, aarch64_crypto_sha256su0, 0),
+    NEONMAP1(vsha256su1q_u32, aarch64_crypto_sha256su1, 0),
+    NEONMAP1(vsha512h2q_u64, aarch64_crypto_sha512h2, 0),
+    NEONMAP1(vsha512hq_u64, aarch64_crypto_sha512h, 0),
+    NEONMAP1(vsha512su0q_u64, aarch64_crypto_sha512su0, 0),
+    NEONMAP1(vsha512su1q_u64, aarch64_crypto_sha512su1, 0),
+    NEONMAP0(vshl_n_v),
+    NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP0(vshll_n_v),
+    NEONMAP0(vshlq_n_v),
+    NEONMAP2(vshlq_v, aarch64_neon_ushl, aarch64_neon_sshl,
+             Add1ArgType | UnsignedAlts),
+    NEONMAP0(vshr_n_v),
+    NEONMAP0(vshrn_n_v),
+    NEONMAP0(vshrq_n_v),
+    NEONMAP1(vsm3partw1q_u32, aarch64_crypto_sm3partw1, 0),
+    NEONMAP1(vsm3partw2q_u32, aarch64_crypto_sm3partw2, 0),
+    NEONMAP1(vsm3ss1q_u32, aarch64_crypto_sm3ss1, 0),
+    NEONMAP1(vsm3tt1aq_u32, aarch64_crypto_sm3tt1a, 0),
+    NEONMAP1(vsm3tt1bq_u32, aarch64_crypto_sm3tt1b, 0),
+    NEONMAP1(vsm3tt2aq_u32, aarch64_crypto_sm3tt2a, 0),
+    NEONMAP1(vsm3tt2bq_u32, aarch64_crypto_sm3tt2b, 0),
+    NEONMAP1(vsm4ekeyq_u32, aarch64_crypto_sm4ekey, 0),
+    NEONMAP1(vsm4eq_u32, aarch64_crypto_sm4e, 0),
+    NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0),
+    NEONMAP1(vst1_x3_v, aarch64_neon_st1x3, 0),
+    NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0),
+    NEONMAP1(vst1q_x2_v, aarch64_neon_st1x2, 0),
+    NEONMAP1(vst1q_x3_v, aarch64_neon_st1x3, 0),
+    NEONMAP1(vst1q_x4_v, aarch64_neon_st1x4, 0),
+    NEONMAP0(vsubhn_v),
+    NEONMAP0(vtst_v),
+    NEONMAP0(vtstq_v),
+    NEONMAP1(vusdot_s32, aarch64_neon_usdot, 0),
+    NEONMAP1(vusdotq_s32, aarch64_neon_usdot, 0),
+    NEONMAP1(vusmmlaq_s32, aarch64_neon_usmmla, 0),
+    NEONMAP1(vxarq_u64, aarch64_crypto_xar, 0),
+};
+
+#define SVEMAP1(NameBase, LLVMIntrinsic, TypeModifier)                         \
+  {#NameBase, SVE::BI__builtin_sve_##NameBase, Intrinsic::LLVMIntrinsic, 0,    \
+   TypeModifier}
 
 #define SVEMAP2(NameBase, TypeModifier)                                        \
-  {SVE::BI__builtin_sve_##NameBase, 0, TypeModifier}
-static const ARMVectorIntrinsicInfo aarch64SVEIntrinsicMap[] = {
+  {#NameBase, SVE::BI__builtin_sve_##NameBase, 0, 0, TypeModifier}
+static const ARMVectorIntrinsicInfo aarch64SVEIntrinsicMap[] = {
 #define GET_SVE_LLVM_INTRINSIC_MAP
 #include "clang/Basic/arm_sve_builtin_cg.inc"
 #undef GET_SVE_LLVM_INTRINSIC_MAP
 };
 
+static bool aarch64SIMDIntrinsicsProvenSorted = false;
+static bool aarch64SVEIntrinsicsProvenSorted = false;
+
 // Check if Builtin `builtinId` is present in `intrinsicMap`. If yes, returns
 // the corresponding info struct.
 static const ARMVectorIntrinsicInfo *
@@ -110,16 +459,13 @@ emitAArch64CompareBuiltinExpr(CIRGenFunction &cgf, CIRGenBuilderTy &builder,
     assert(!cast<cir::VectorType>(retTy).getIsScalable() &&
            "This is only intended for fixed-width vectors");
     // Vector types are cast to i8 vectors. Recover original type.
-    cgf.cgm.errorNYI(loc, std::string("unimplemented vector compare"));
+    src = builder.createBitcast(src, retTy);
   }
 
   mlir::Value zero = builder.getNullValue(src.getType(), loc);
-  if (cir::isFPOrVectorOfFPType(src.getType())) {
-    cgf.cgm.errorNYI(loc, std::string("unimplemented FP compare"));
-  }
 
   if (!scalarCmp)
-    cgf.cgm.errorNYI(loc, std::string("unimplemented vector compare"));
+    return builder.createVecCompare(loc, kind, src, zero);
 
   // For scalars, cast !cir.bool to !cir.int<s, 1> so that the compare
   // result is sign- rather zero-extended when casting to the output
@@ -131,6 +477,364 @@ emitAArch64CompareBuiltinExpr(CIRGenFunction &cgf, CIRGenBuilderTy &builder,
   return builder.createCast(loc, cir::CastKind::integral, cmp, retTy);
 }
 
+// TODO(cir): Remove `loc` from the list of arguments once all NYIs are gone.
+static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
+                                   mlir::Location loc,
+                                   bool hasLegalHalfType = true,
+                                   bool v1Ty = false,
+                                   bool allowBFloatArgsAndRet = true) {
+  int isQuad = typeFlags.isQuad();
+  switch (typeFlags.getEltType()) {
+  case NeonTypeFlags::Int8:
+  case NeonTypeFlags::Poly8:
+    return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt8Ty
+                                                       : cgf->sInt8Ty,
+                                v1Ty ? 1 : (8 << isQuad));
+  case NeonTypeFlags::MFloat8:
+    cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: MFloat8"));
+    [[fallthrough]];
+  case NeonTypeFlags::Int16:
+  case NeonTypeFlags::Poly16:
+    return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt16Ty
+                                                       : cgf->sInt16Ty,
+                                v1Ty ? 1 : (4 << isQuad));
+  case NeonTypeFlags::BFloat16:
+    if (allowBFloatArgsAndRet)
+      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: BFloat16"));
+    else
+      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: BFloat16"));
+    [[fallthrough]];
+  case NeonTypeFlags::Float16:
+    if (hasLegalHalfType)
+      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
+    else
+      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
+    [[fallthrough]];
+  case NeonTypeFlags::Int32:
+    return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt32Ty
+                                                       : cgf->sInt32Ty,
+                                v1Ty ? 1 : (2 << isQuad));
+  case NeonTypeFlags::Int64:
+  case NeonTypeFlags::Poly64:
+    return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt64Ty
+                                                       : cgf->sInt64Ty,
+                                v1Ty ? 1 : (1 << isQuad));
+  case NeonTypeFlags::Poly128:
+    // FIXME: i128 and f128 doesn't get fully support in Clang and llvm.
+    // There is a lot of i128 and f128 API missing.
+    // so we use v16i8 to represent poly128 and get pattern matched.
+    cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Poly128"));
+    [[fallthrough]];
+  case NeonTypeFlags::Float32:
+    return cir::VectorType::get(cgf->getCIRGenModule().floatTy,
+                                v1Ty ? 1 : (2 << isQuad));
+  case NeonTypeFlags::Float64:
+    return cir::VectorType::get(cgf->getCIRGenModule().doubleTy,
+                                v1Ty ? 1 : (1 << isQuad));
+  }
+  llvm_unreachable("Unknown vector element type!");
+}
+
+static mlir::Value emitCommonNeonBuiltinExpr(
+    CIRGenFunction &cgf, unsigned builtinID, unsigned llvmIntrinsic,
+    unsigned altLLVMIntrinsic, const char *nameHint, unsigned modifier,
+    const CallExpr *expr, llvm::SmallVectorImpl<mlir::Value> &ops) {
+
+  mlir::Location loc = cgf.getLoc(expr->getExprLoc());
+  clang::ASTContext &ctx = cgf.getContext();
+
+  // Extract the trailing immediate argument that encodes the type discriminator
+  // for this overloaded intrinsic.
+  // TODO: Move to the parent code that takes care of argument processing.
+  const clang::Expr *arg = expr->getArg(expr->getNumArgs() - 1);
+  std::optional<llvm::APSInt> neonTypeConst = arg->getIntegerConstantExpr(ctx);
+  if (!neonTypeConst)
+    return nullptr;
+
+  // Determine the type of this overloaded NEON intrinsic.
+  NeonTypeFlags neonType(neonTypeConst->getZExtValue());
+  const bool hasLegalHalfType = cgf.getTarget().hasFastHalfType();
+
+  // The value of allowBFloatArgsAndRet is true for AArch64, but it should
+  // come from ABI info.
+  const bool allowBFloatArgsAndRet = false;
+  // FIXME
+  // getTargetHooks().getABIInfo().allowBFloatArgsAndRet();
+
+  cir::VectorType vTy = getNeonType(&cgf, neonType, loc, hasLegalHalfType,
+                                    false, allowBFloatArgsAndRet);
+  mlir::Type ty = vTy;
+  if (!ty)
+    return nullptr;
+
+  switch (builtinID) {
+  case NEON::BI__builtin_neon_splat_lane_v:
+  case NEON::BI__builtin_neon_splat_laneq_v:
+  case NEON::BI__builtin_neon_splatq_lane_v:
+  case NEON::BI__builtin_neon_splatq_laneq_v:
+  case NEON::BI__builtin_neon_vpadd_v:
+  case NEON::BI__builtin_neon_vpaddq_v:
+  case NEON::BI__builtin_neon_vabs_v:
+  case NEON::BI__builtin_neon_vabsq_v:
+  case NEON::BI__builtin_neon_vadd_v:
+  case NEON::BI__builtin_neon_vaddq_v:
+  case NEON::BI__builtin_neon_vaddhn_v:
+  case NEON::BI__builtin_neon_vcale_v:
+  case NEON::BI__builtin_neon_vcaleq_v:
+  case NEON::BI__builtin_neon_vcalt_v:
+  case NEON::BI__builtin_neon_vcaltq_v:
+  case NEON::BI__builtin_neon_vcage_v:
+  case NEON::BI__builtin_neon_vcageq_v:
+  case NEON::BI__builtin_neon_vcagt_v:
+  case NEON::BI__builtin_neon_vcagtq_v:
+    cgf.cgm.errorNYI(expr->getSourceRange(),
+                     std::string("unimplemented AArch64 builtin call: ") +
+                         ctx.BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  case NEON::BI__builtin_neon_vceqz_v:
+  case NEON::BI__builtin_neon_vceqzq_v:
+    return emitAArch64CompareBuiltinExpr(cgf, cgf.getBuilder(), loc, ops[0],
+                                         vTy, cir::CmpOpKind::eq);
+  case NEON::BI__builtin_neon_vcgez_v:
+  case NEON::BI__builtin_neon_vcgezq_v:
+  case NEON::BI__builtin_neon_vclez_v:
+  case NEON::BI__builtin_neon_vclezq_v:
+  case NEON::BI__builtin_neon_vcgtz_v:
+  case NEON::BI__builtin_neon_vcgtzq_v:
+  case NEON::BI__builtin_neon_vcltz_v:
+  case NEON::BI__builtin_neon_vcltzq_v:
+  case NEON::BI__builtin_neon_vclz_v:
+  case NEON::BI__builtin_neon_vclzq_v:
+  case NEON::BI__builtin_neon_vcvt_f32_v:
+  case NEON::BI__builtin_neon_vcvtq_f32_v:
+  case NEON::BI__builtin_neon_vcvt_f16_s16:
+  case NEON::BI__builtin_neon_vcvt_f16_u16:
+  case NEON::BI__builtin_neon_vcvtq_f16_s16:
+  case NEON::BI__builtin_neon_vcvtq_f16_u16:
+  case NEON::BI__builtin_neon_vcvt_n_f16_s16:
+  case NEON::BI__builtin_neon_vcvt_n_f16_u16:
+  case NEON::BI__builtin_neon_vcvtq_n_f16_s16:
+  case NEON::BI__builtin_neon_vcvtq_n_f16_u16:
+  case NEON::BI__builtin_neon_vcvt_n_f32_v:
+  case NEON::BI__builtin_neon_vcvt_n_f64_v:
+  case NEON::BI__builtin_neon_vcvtq_n_f32_v:
+  case NEON::BI__builtin_neon_vcvtq_n_f64_v:
+  case NEON::BI__builtin_neon_vcvt_n_s16_f16:
+  case NEON::BI__builtin_neon_vcvt_n_s32_v:
+  case NEON::BI__builtin_neon_vcvt_n_u16_f16:
+  case NEON::BI__builtin_neon_vcvt_n_u32_v:
+  case NEON::BI__builtin_neon_vcvt_n_s64_v:
+  case NEON::BI__builtin_neon_vcvt_n_u64_v:
+  case NEON::BI__builtin_neon_vcvtq_n_s16_f16:
+  case NEON::BI__builtin_neon_vcvtq_n_s32_v:
+  case NEON::BI__builtin_neon_vcvtq_n_u16_f16:
+  case NEON::BI__builtin_neon_vcvtq_n_u32_v:
+  case NEON::BI__builtin_neon_vcvtq_n_s64_v:
+  case NEON::BI__builtin_neon_vcvtq_n_u64_v:
+  case NEON::BI__builtin_neon_vcvt_s32_v:
+  case NEON::BI__builtin_neon_vcvt_u32_v:
+  case NEON::BI__builtin_neon_vcvt_s64_v:
+  case NEON::BI__builtin_neon_vcvt_u64_v:
+  case NEON::BI__builtin_neon_vcvt_s16_f16:
+  case NEON::BI__builtin_neon_vcvt_u16_f16:
+  case NEON::BI__builtin_neon_vcvtq_s32_v:
+  case NEON::BI__builtin_neon_vcvtq_u32_v:
+  case NEON::BI__builtin_neon_vcvtq_s64_v:
+  case NEON::BI__builtin_neon_vcvtq_u64_v:
+  case NEON::BI__builtin_neon_vcvtq_s16_f16:
+  case NEON::BI__builtin_neon_vcvtq_u16_f16:
+  case NEON::BI__builtin_neon_vcvta_s16_f16:
+  case NEON::BI__builtin_neon_vcvta_s32_v:
+  case NEON::BI__builtin_neon_vcvta_s64_v:
+  case NEON::BI__builtin_neon_vcvta_u16_f16:
+  case NEON::BI__builtin_neon_vcvta_u32_v:
+  case NEON::BI__builtin_neon_vcvta_u64_v:
+  case NEON::BI__builtin_neon_vcvtaq_s16_f16:
+  case NEON::BI__builtin_neon_vcvtaq_s32_v:
+  case NEON::BI__builtin_neon_vcvtaq_s64_v:
+  case NEON::BI__builtin_neon_vcvtaq_u16_f16:
+  case NEON::BI__builtin_neon_vcvtaq_u32_v:
+  case NEON::BI__builtin_neon_vcvtaq_u64_v:
+  case NEON::BI__builtin_neon_vcvtn_s16_f16:
+  case NEON::BI__builtin_neon_vcvtn_s32_v:
+  case NEON::BI__builtin_neon_vcvtn_s64_v:
+  case NEON::BI__builtin_neon_vcvtn_u16_f16:
+  case NEON::BI__builtin_neon_vcvtn_u32_v:
+  case NEON::BI__builtin_neon_vcvtn_u64_v:
+  case NEON::BI__builtin_neon_vcvtnq_s16_f16:
+  case NEON::BI__builtin_neon_vcvtnq_s32_v:
+  case NEON::BI__builtin_neon_vcvtnq_s64_v:
+  case NEON::BI__builtin_neon_vcvtnq_u16_f16:
+  case NEON::BI__builtin_neon_vcvtnq_u32_v:
+  case NEON::BI__builtin_neon_vcvtnq_u64_v:
+  case NEON::BI__builtin_neon_vcvtp_s16_f16:
+  case NEON::BI__builtin_neon_vcvtp_s32_v:
+  case NEON::BI__builtin_neon_vcvtp_s64_v:
+  case NEON::BI__builtin_neon_vcvtp_u16_f16:
+  case NEON::BI__builtin_neon_vcvtp_u32_v:
+  case NEON::BI__builtin_neon_vcvtp_u64_v:
+  case NEON::BI__builtin_neon_vcvtpq_s16_f16:
+  case NEON::BI__builtin_neon_vcvtpq_s32_v:
+  case NEON::BI__builtin_neon_vcvtpq_s64_v:
+  case NEON::BI__builtin_neon_vcvtpq_u16_f16:
+  case NEON::BI__builtin_neon_vcvtpq_u32_v:
+  case NEON::BI__builtin_neon_vcvtpq_u64_v:
+  case NEON::BI__builtin_neon_vcvtm_s16_f16:
+  case NEON::BI__builtin_neon_vcvtm_s32_v:
+  case NEON::BI__builtin_neon_vcvtm_s64_v:
+  case NEON::BI__builtin_neon_vcvtm_u16_f16:
+  case NEON::BI__builtin_neon_vcvtm_u32_v:
+  case NEON::BI__builtin_neon_vcvtm_u64_v:
+  case NEON::BI__builtin_neon_vcvtmq_s16_f16:
+  case NEON::BI__builtin_neon_vcvtmq_s32_v:
+  case NEON::BI__builtin_neon_vcvtmq_s64_v:
+  case NEON::BI__builtin_neon_vcvtmq_u16_f16:
+  case NEON::BI__builtin_neon_vcvtmq_u32_v:
+  case NEON::BI__builtin_neon_vcvtmq_u64_v:
+  case NEON::BI__builtin_neon_vcvtx_f32_v:
+  case NEON::BI__builtin_neon_vext_v:
+  case NEON::BI__builtin_neon_vextq_v:
+  case NEON::BI__builtin_neon_vfma_v:
+  case NEON::BI__builtin_neon_vfmaq_v:
+  case NEON::BI__builtin_neon_vld1_v:
+  case NEON::BI__builtin_neon_vld1q_v:
+  case NEON::BI__builtin_neon_vld1_x2_v:
+  case NEON::BI__builtin_neon_vld1q_x2_v:
+  case NEON::BI__builtin_neon_vld1_x3_v:
+  case NEON::BI__builtin_neon_vld1q_x3_v:
+  case NEON::BI__builtin_neon_vld1_x4_v:
+  case NEON::BI__builtin_neon_vld1q_x4_v:
+  case NEON::BI__builtin_neon_vld2_v:
+  case NEON::BI__builtin_neon_vld2q_v:
+  case NEON::BI__builtin_neon_vld3_v:
+  case NEON::BI__builtin_neon_vld3q_v:
+  case NEON::BI__builtin_neon_vld4_v:
+  case NEON::BI__builtin_neon_vld4q_v:
+  case NEON::BI__builtin_neon_vld2_dup_v:
+  case NEON::BI__builtin_neon_vld2q_dup_v:
+  case NEON::BI__builtin_neon_vld3_dup_v:
+  case NEON::BI__builtin_neon_vld3q_dup_v:
+  case NEON::BI__builtin_neon_vld4_dup_v:
+  case NEON::BI__builtin_neon_vld4q_dup_v:
+  case NEON::BI__builtin_neon_vld1_dup_v:
+  case NEON::BI__builtin_neon_vld1q_dup_v:
+  case NEON::BI__builtin_neon_vld2_lane_v:
+  case NEON::BI__builtin_neon_vld2q_lane_v:
+  case NEON::BI__builtin_neon_vld3_lane_v:
+  case NEON::BI__builtin_neon_vld3q_lane_v:
+  case NEON::BI__builtin_neon_vld4_lane_v:
+  case NEON::BI__builtin_neon_vld4q_lane_v:
+  case NEON::BI__builtin_neon_vmovl_v:
+  case NEON::BI__builtin_neon_vmovn_v:
+  case NEON::BI__builtin_neon_vmull_v:
+  case NEON::BI__builtin_neon_vpadal_v:
+  case NEON::BI__builtin_neon_vpadalq_v:
+  case NEON::BI__builtin_neon_vpaddl_v:
+  case NEON::BI__builtin_neon_vpaddlq_v:
+  case NEON::BI__builtin_neon_vqdmlal_v:
+  case NEON::BI__builtin_neon_vqdmlsl_v:
+  case NEON::BI__builtin_neon_vqdmulhq_lane_v:
+  case NEON::BI__builtin_neon_vqdmulh_lane_v:
+  case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
+  case NEON::BI__builtin_neon_vqrdmulh_lane_v:
+  case NEON::BI__builtin_neon_vqdmulhq_laneq_v:
+  case NEON::BI__builtin_neon_vqdmulh_laneq_v:
+  case NEON::BI__builtin_neon_vqrdmulhq_laneq_v:
+  case NEON::BI__builtin_neon_vqrdmulh_laneq_v:
+  case NEON::BI__builtin_neon_vqshl_n_v:
+  case NEON::BI__builtin_neon_vqshlq_n_v:
+  case NEON::BI__builtin_neon_vqshlu_n_v:
+  case NEON::BI__builtin_neon_vqshluq_n_v:
+  case NEON::BI__builtin_neon_vrecpe_v:
+  case NEON::BI__builtin_neon_vrecpeq_v:
+  case NEON::BI__builtin_neon_vrsqrte_v:
+  case NEON::BI__builtin_neon_vrsqrteq_v:
+  case NEON::BI__builtin_neon_vrndi_v:
+  case NEON::BI__builtin_neon_vrndiq_v:
+  case NEON::BI__builtin_neon_vrshr_n_v:
+  case NEON::BI__builtin_neon_vrshrq_n_v:
+  case NEON::BI__builtin_neon_vsha512hq_u64:
+  case NEON::BI__builtin_neon_vsha512h2q_u64:
+  case NEON::BI__builtin_neon_vsha512su0q_u64:
+  case NEON::BI__builtin_neon_vsha512su1q_u64:
+  case NEON::BI__builtin_neon_vshl_n_v:
+  case NEON::BI__builtin_neon_vshlq_n_v:
+  case NEON::BI__builtin_neon_vshll_n_v:
+  case NEON::BI__builtin_neon_vshrn_n_v:
+  case NEON::BI__builtin_neon_vshr_n_v:
+  case NEON::BI__builtin_neon_vshrq_n_v:
+  case NEON::BI__builtin_neon_vst1_v:
+  case NEON::BI__builtin_neon_vst1q_v:
+  case NEON::BI__builtin_neon_vst2_v:
+  case NEON::BI__builtin_neon_vst2q_v:
+  case NEON::BI__builtin_neon_vst3_v:
+  case NEON::BI__builtin_neon_vst3q_v:
+  case NEON::BI__builtin_neon_vst4_v:
+  case NEON::BI__builtin_neon_vst4q_v:
+  case NEON::BI__builtin_neon_vst2_lane_v:
+  case NEON::BI__builtin_neon_vst2q_lane_v:
+  case NEON::BI__builtin_neon_vst3_lane_v:
+  case NEON::BI__builtin_neon_vst3q_lane_v:
+  case NEON::BI__builtin_neon_vst4_lane_v:
+  case NEON::BI__builtin_neon_vst4q_lane_v:
+  case NEON::BI__builtin_neon_vsm3partw1q_u32:
+  case NEON::BI__builtin_neon_vsm3partw2q_u32:
+  case NEON::BI__builtin_neon_vsm3ss1q_u32:
+  case NEON::BI__builtin_neon_vsm4ekeyq_u32:
+  case NEON::BI__builtin_neon_vsm4eq_u32:
+  case NEON::BI__builtin_neon_vsm3tt1aq_u32:
+  case NEON::BI__builtin_neon_vsm3tt1bq_u32:
+  case NEON::BI__builtin_neon_vsm3tt2aq_u32:
+  case NEON::BI__builtin_neon_vsm3tt2bq_u32:
+  case NEON::BI__builtin_neon_vst1_x2_v:
+  case NEON::BI__builtin_neon_vst1q_x2_v:
+  case NEON::BI__builtin_neon_vst1_x3_v:
+  case NEON::BI__builtin_neon_vst1q_x3_v:
+  case NEON::BI__builtin_neon_vst1_x4_v:
+  case NEON::BI__builtin_neon_vst1q_x4_v:
+  case NEON::BI__builtin_neon_vsubhn_v:
+  case NEON::BI__builtin_neon_vtrn_v:
+  case NEON::BI__builtin_neon_vtrnq_v:
+  case NEON::BI__builtin_neon_vtst_v:
+  case NEON::BI__builtin_neon_vtstq_v:
+  case NEON::BI__builtin_neon_vuzp_v:
+  case NEON::BI__builtin_neon_vuzpq_v:
+  case NEON::BI__builtin_neon_vxarq_u64:
+  case NEON::BI__builtin_neon_vzip_v:
+  case NEON::BI__builtin_neon_vzipq_v:
+  case NEON::BI__builtin_neon_vdot_s32:
+  case NEON::BI__builtin_neon_vdot_u32:
+  case NEON::BI__builtin_neon_vdotq_s32:
+  case NEON::BI__builtin_neon_vdotq_u32:
+  case NEON::BI__builtin_neon_vfmlal_low_f16:
+  case NEON::BI__builtin_neon_vfmlalq_low_f16:
+  case NEON::BI__builtin_neon_vfmlsl_low_f16:
+  case NEON::BI__builtin_neon_vfmlslq_low_f16:
+  case NEON::BI__builtin_neon_vfmlal_high_f16:
+  case NEON::BI__builtin_neon_vfmlalq_high_f16:
+  case NEON::BI__builtin_neon_vfmlsl_high_f16:
+  case NEON::BI__builtin_neon_vfmlslq_high_f16:
+  case NEON::BI__builtin_neon_vmmlaq_s32:
+  case NEON::BI__builtin_neon_vmmlaq_u32:
+  case NEON::BI__builtin_neon_vusmmlaq_s32:
+  case NEON::BI__builtin_neon_vusdot_s32:
+  case NEON::BI__builtin_neon_vusdotq_s32:
+  case NEON::BI__builtin_neon_vbfdot_f32:
+  case NEON::BI__builtin_neon_vbfdotq_f32:
+  case NEON::BI__builtin_neon___a32_vcvt_bf16_f32:
+  default:
+    cgf.cgm.errorNYI(expr->getSourceRange(),
+                     std::string("unimplemented AArch64 builtin call: ") +
+                         ctx.BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  }
+}
+
 // Emit an intrinsic where all operands are of the same type as the result.
 // Depending on mode, this may be a constrained floating-point intrinsic.
 static mlir::Value
@@ -298,64 +1002,6 @@ static bool hasExtraNeonArgument(unsigned builtinID) {
   return mask != 0;
 }
 
-// TODO(cir): Remove `loc` from the list of arguments once all NYIs are gone.
-static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
-                                   mlir::Location loc,
-                                   bool hasLegalHalfType = true,
-                                   bool v1Ty = false,
-                                   bool allowBFloatArgsAndRet = true) {
-  int isQuad = typeFlags.isQuad();
-  switch (typeFlags.getEltType()) {
-  case NeonTypeFlags::Int8:
-  case NeonTypeFlags::Poly8:
-    return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt8Ty
-                                                       : cgf->sInt8Ty,
-                                v1Ty ? 1 : (8 << isQuad));
-  case NeonTypeFlags::MFloat8:
-    cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: MFloat8"));
-    [[fallthrough]];
-  case NeonTypeFlags::Int16:
-  case NeonTypeFlags::Poly16:
-    return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt16Ty
-                                                       : cgf->sInt16Ty,
-                                v1Ty ? 1 : (4 << isQuad));
-  case NeonTypeFlags::BFloat16:
-    if (allowBFloatArgsAndRet)
-      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: BFloat16"));
-    else
-      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: BFloat16"));
-    [[fallthrough]];
-  case NeonTypeFlags::Float16:
-    if (hasLegalHalfType)
-      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
-    else
-      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
-    [[fallthrough]];
-  case NeonTypeFlags::Int32:
-    return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt32Ty
-                                                       : cgf->sInt32Ty,
-                                v1Ty ? 1 : (2 << isQuad));
-  case NeonTypeFlags::Int64:
-  case NeonTypeFlags::Poly64:
-    return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt64Ty
-                                                       : cgf->sInt64Ty,
-                                v1Ty ? 1 : (1 << isQuad));
-  case NeonTypeFlags::Poly128:
-    // FIXME: i128 and f128 doesn't get fully support in Clang and llvm.
-    // There is a lot of i128 and f128 API missing.
-    // so we use v16i8 to represent poly128 and get pattern matched.
-    cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Poly128"));
-    [[fallthrough]];
-  case NeonTypeFlags::Float32:
-    return cir::VectorType::get(cgf->getCIRGenModule().floatTy,
-                                v1Ty ? 1 : (2 << isQuad));
-  case NeonTypeFlags::Float64:
-    return cir::VectorType::get(cgf->getCIRGenModule().doubleTy,
-                                v1Ty ? 1 : (1 << isQuad));
-  }
-  llvm_unreachable("Unknown vector element type!");
-}
-
 // TODO(cir): Remove `cgm` from the list of arguments once all NYI(s) are gone.
 template <typename Operation>
 static mlir::Value
@@ -1585,6 +2231,18 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
 
   mlir::Location loc = getLoc(expr->getExprLoc());
 
+  // Not all intrinsics handled by the common case work for AArch64 yet, so only
+  // defer to common code if it's been added to our special map.
+  const armVectorIntrinsicInfo *builtin;
+  builtin = findARMVectorIntrinsicInMap(AArch64SIMDIntrinsicMap, builtinID,
+                                        aarch64SIMDIntrinsicsProvenSorted);
+
+  if (builtin)
+    return emitCommonNeonBuiltinExpr(
+        *this, builtin->builtinID, builtin->llvmIntrinsic,
+        builtin->altLLVMIntrinsic, builtin->nameHint, builtin->typeModifier,
+        expr, ops);
+
   // Handle non-overloaded intrinsics first.
   switch (builtinID) {
   default:
diff --git a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp
index 19c407545b961..b15d7f699119c 100644
--- a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp
+++ b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp
@@ -102,6 +102,8 @@ class CIRGenConsumer : public clang::ASTConsumer {
 
     if (!FEOptions.ClangIRDisableCIRVerifier) {
       if (!Gen->verifyModule()) {
+        // FIXME: Debugging aid only — remove before merging.
+        Gen->getModule().dump();
         CI.getDiagnostics().Report(
             diag::err_cir_verification_failed_pre_passes);
         llvm::report_fatal_error(
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index aa95e92b9f2e9..52261c97b0d2e 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -561,6 +561,12 @@ enum {
       AddRetType | VectorizeRetType | Add1ArgType | InventFloatType
 };
 
+//===----------------------------------------------------------------------===//
+//  Intrinsic maps
+//
+//  Maps that help automate code-generation.
+//===----------------------------------------------------------------------===//
+
 namespace {
 struct ARMVectorIntrinsicInfo {
   const char *NameHint;
diff --git a/clang/test/CodeGen/AArch64/neon-misc.c b/clang/test/CodeGen/AArch64/neon-misc.c
index 6eadaaf27a210..ac2c83aa03ccf 100644
--- a/clang/test/CodeGen/AArch64/neon-misc.c
+++ b/clang/test/CodeGen/AArch64/neon-misc.c
@@ -7,313 +7,8 @@
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define dso_local <8 x i8> @test_vceqz_s8(
-// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    ret <8 x i8> [[VCEQZ_I]]
-//
-uint8x8_t test_vceqz_s8(int8x8_t a) {
-  return vceqz_s8(a);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vceqz_s16(
-// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
-// CHECK-NEXT:    ret <4 x i16> [[VCEQZ_I]]
-//
-uint16x4_t test_vceqz_s16(int16x4_t a) {
-  return vceqz_s16(a);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vceqz_s32(
-// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
-// CHECK-NEXT:    ret <2 x i32> [[VCEQZ_I]]
-//
-uint32x2_t test_vceqz_s32(int32x2_t a) {
-  return vceqz_s32(a);
-}
-
-// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_s64(
-// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
-// CHECK-NEXT:    ret <1 x i64> [[VCEQZ_I]]
-//
-uint64x1_t test_vceqz_s64(int64x1_t a) {
-  return vceqz_s64(a);
-}
-
-// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_u64(
-// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
-// CHECK-NEXT:    ret <1 x i64> [[VCEQZ_I]]
-//
-uint64x1_t test_vceqz_u64(uint64x1_t a) {
-  return vceqz_u64(a);
-}
-
-// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_p64(
-// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
-// CHECK-NEXT:    ret <1 x i64> [[VCEQZ_I]]
-//
-uint64x1_t test_vceqz_p64(poly64x1_t a) {
-  return vceqz_p64(a);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vceqzq_s8(
-// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    ret <16 x i8> [[VCEQZ_I]]
-//
-uint8x16_t test_vceqzq_s8(int8x16_t a) {
-  return vceqzq_s8(a);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vceqzq_s16(
-// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
-// CHECK-NEXT:    ret <8 x i16> [[VCEQZ_I]]
-//
-uint16x8_t test_vceqzq_s16(int16x8_t a) {
-  return vceqzq_s16(a);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vceqzq_s32(
-// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
-// CHECK-NEXT:    ret <4 x i32> [[VCEQZ_I]]
-//
-uint32x4_t test_vceqzq_s32(int32x4_t a) {
-  return vceqzq_s32(a);
-}
-
-// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_s64(
-// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
-// CHECK-NEXT:    ret <2 x i64> [[VCEQZ_I]]
-//
-uint64x2_t test_vceqzq_s64(int64x2_t a) {
-  return vceqzq_s64(a);
-}
-
-// CHECK-LABEL: define dso_local <8 x i8> @test_vceqz_u8(
-// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    ret <8 x i8> [[VCEQZ_I]]
-//
-uint8x8_t test_vceqz_u8(uint8x8_t a) {
-  return vceqz_u8(a);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vceqz_u16(
-// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
-// CHECK-NEXT:    ret <4 x i16> [[VCEQZ_I]]
-//
-uint16x4_t test_vceqz_u16(uint16x4_t a) {
-  return vceqz_u16(a);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vceqz_u32(
-// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
-// CHECK-NEXT:    ret <2 x i32> [[VCEQZ_I]]
-//
-uint32x2_t test_vceqz_u32(uint32x2_t a) {
-  return vceqz_u32(a);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vceqzq_u8(
-// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    ret <16 x i8> [[VCEQZ_I]]
-//
-uint8x16_t test_vceqzq_u8(uint8x16_t a) {
-  return vceqzq_u8(a);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vceqzq_u16(
-// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
-// CHECK-NEXT:    ret <8 x i16> [[VCEQZ_I]]
-//
-uint16x8_t test_vceqzq_u16(uint16x8_t a) {
-  return vceqzq_u16(a);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vceqzq_u32(
-// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
-// CHECK-NEXT:    ret <4 x i32> [[VCEQZ_I]]
-//
-uint32x4_t test_vceqzq_u32(uint32x4_t a) {
-  return vceqzq_u32(a);
-}
-
-// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_u64(
-// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
-// CHECK-NEXT:    ret <2 x i64> [[VCEQZ_I]]
-//
-uint64x2_t test_vceqzq_u64(uint64x2_t a) {
-  return vceqzq_u64(a);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vceqz_f32(
-// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT:    [[TMP3:%.*]] = fcmp oeq <2 x float> [[TMP2]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
-// CHECK-NEXT:    ret <2 x i32> [[VCEQZ_I]]
-//
-uint32x2_t test_vceqz_f32(float32x2_t a) {
-  return vceqz_f32(a);
-}
-
-// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_f64(
-// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK-NEXT:    [[TMP3:%.*]] = fcmp oeq <1 x double> [[TMP2]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64>
-// CHECK-NEXT:    ret <1 x i64> [[VCEQZ_I]]
-//
-uint64x1_t test_vceqz_f64(float64x1_t a) {
-  return vceqz_f64(a);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vceqzq_f32(
-// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT:    [[TMP3:%.*]] = fcmp oeq <4 x float> [[TMP2]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
-// CHECK-NEXT:    ret <4 x i32> [[VCEQZ_I]]
-//
-uint32x4_t test_vceqzq_f32(float32x4_t a) {
-  return vceqzq_f32(a);
-}
-
-// CHECK-LABEL: define dso_local <8 x i8> @test_vceqz_p8(
-// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    ret <8 x i8> [[VCEQZ_I]]
-//
-uint8x8_t test_vceqz_p8(poly8x8_t a) {
-  return vceqz_p8(a);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vceqzq_p8(
-// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    ret <16 x i8> [[VCEQZ_I]]
-//
-uint8x16_t test_vceqzq_p8(poly8x16_t a) {
-  return vceqzq_p8(a);
-}
-
-// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_f64(
-// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT:    [[TMP3:%.*]] = fcmp oeq <2 x double> [[TMP2]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
-// CHECK-NEXT:    ret <2 x i64> [[VCEQZ_I]]
-//
-uint64x2_t test_vceqzq_f64(float64x2_t a) {
-  return vceqzq_f64(a);
-}
-
-// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_p64(
-// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
-// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
-// CHECK-NEXT:    ret <2 x i64> [[VCEQZ_I]]
-//
-uint64x2_t test_vceqzq_p64(poly64x2_t a) {
-  return vceqzq_p64(a);
-}
-
 // CHECK-LABEL: define dso_local <8 x i8> @test_vcgez_s8(
-// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]])  #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = icmp sge <8 x i8> [[A]], zeroinitializer
 // CHECK-NEXT:    [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
diff --git a/clang/test/CodeGen/AArch64/neon/intrinsics.c b/clang/test/CodeGen/AArch64/neon/intrinsics.c
index 87f56f7997ce9..41f3ba6fda260 100644
--- a/clang/test/CodeGen/AArch64/neon/intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon/intrinsics.c
@@ -16,6 +16,20 @@
 
 #include <arm_neon.h>
 
+// LLVM-LABEL: @test_vnegd_s64
+// CIR-LABEL: @vnegd_s64
+int64_t test_vnegd_s64(int64_t a) {
+// CIR: cir.unary(minus, {{.*}}) : !s64
+
+// LLVM-SAME: i64 {{.*}} [[A:%.*]])
+// LLVM:          [[VNEGD_I:%.*]] = sub i64 0, [[A]]
+// LLVM-NEXT:     ret i64 [[VNEGD_I]]
+  return (int64_t)vnegd_s64(a);
+}
+
+//===------------------------------------------------------===//
+// 2.1.2.2 Bitwise equal to zero
+//===------------------------------------------------------===//
 // LLVM-LABEL: @test_vceqzd_s64
 // CIR-LABEL: @vceqzd_s64
 uint64_t test_vceqzd_s64(int64_t a) {
@@ -31,15 +45,363 @@ uint64_t test_vceqzd_s64(int64_t a) {
   return (uint64_t)vceqzd_s64(a);
 }
 
-// LLVM-LABEL: @test_vnegd_s64
-// CIR-LABEL: @vnegd_s64
-int64_t test_vnegd_s64(int64_t a) {
-// CIR: cir.unary(minus, {{.*}}) : !s64
+// LLVM-LABEL: @test_vceqz_s8(
+// CIR-LABEL: @vceqz_s8(
+uint8x8_t test_vceqz_s8(int8x8_t a) {
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<8 x !s8i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<8 x !s8i>
+
+// LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// LLVM:    [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// LLVM-NEXT:    ret <8 x i8> [[VCEQZ_I]]
+  return vceqz_s8(a);
+}
 
-// LLVM-SAME: i64{{.*}} [[A:%.*]])
-// LLVM:          [[VNEGD_I:%.*]] = sub i64 0, [[A]]
-// LLVM-NEXT:     ret i64 [[VNEGD_I]]
-  return (int64_t)vnegd_s64(a);
+// LLVM-LABEL: @test_vceqz_s16(
+// CIR-LABEL: @vceqz_s16(
+uint16x4_t test_vceqz_s16(int16x4_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !s16i>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<4 x !s16i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<4 x !s16i>, !cir.vector<4 x !s16i>
+
+// LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// LLVM-NEXT:    ret <4 x i16> [[VCEQZ_I]]
+  return vceqz_s16(a);
+}
+
+// LLVM-LABEL: @test_vceqz_s32(
+// CIR-LABEL: @vceqz_s32(
+uint32x2_t test_vceqz_s32(int32x2_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !s32i>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<2 x !s32i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<2 x !s32i>, !cir.vector<2 x !s32i>
+
+// LLVM-SAME: <2 x i32> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// LLVM-NEXT:    ret <2 x i32> [[VCEQZ_I]]
+  return vceqz_s32(a);
+}
+
+// LLVM-LABEL: @test_vceqz_s64(
+// CIR-LABEL: @vceqz_s64(
+uint64x1_t test_vceqz_s64(int64x1_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<8 x !s8i> -> !cir.vector<1 x !s64i>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<1 x !s64i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<1 x !s64i>, !cir.vector<1 x !s64i>
+
+// LLVM-SAME: <1 x i64> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// LLVM-NEXT:    ret <1 x i64> [[VCEQZ_I]]
+  return vceqz_s64(a);
+}
+
+// LLVM-LABEL: @test_vceqz_u64(
+// CIR-LABEL: @vceqz_u64(
+uint64x1_t test_vceqz_u64(uint64x1_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<8 x !s8i> -> !cir.vector<1 x !u64i>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<1 x !u64i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<1 x !u64i>, !cir.vector<1 x !s64i>
+
+// LLVM-SAME: <1 x i64> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// LLVM-NEXT:    ret <1 x i64> [[VCEQZ_I]]
+  return vceqz_u64(a);
+}
+
+// LLVM-LABEL: @test_vceqz_p64(
+// CIR-LABEL: @vceqz_p64(
+uint64x1_t test_vceqz_p64(poly64x1_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<8 x !s8i> -> !cir.vector<1 x !s64i>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<1 x !s64i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<1 x !s64i>, !cir.vector<1 x !s64i>
+
+// LLVM-SAME: <1 x i64> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// LLVM-NEXT:    ret <1 x i64> [[VCEQZ_I]]
+  return vceqz_p64(a);
+}
+
+// LLVM-LABEL: @test_vceqzq_s8(
+// CIR-LABEL: @vceqzq_s8(
+uint8x16_t test_vceqzq_s8(int8x16_t a) {
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<16 x !s8i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<16 x !s8i>
+
+// LLVM-SAME: <16 x i8> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// LLVM-NEXT:    ret <16 x i8> [[VCEQZ_I]]
+  return vceqzq_s8(a);
+}
+
+// LLVM-LABEL: @test_vceqzq_s16(
+// CIR-LABEL: @vceqzq_s16(
+uint16x8_t test_vceqzq_s16(int16x8_t a) {
+// LLVM-SAME: <8 x i16> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// LLVM-NEXT:    ret <8 x i16> [[VCEQZ_I]]
+  return vceqzq_s16(a);
+}
+
+// LLVM-LABEL: @test_vceqzq_s32(
+// CIR-LABEL: @vceqzq_s32(
+uint32x4_t test_vceqzq_s32(int32x4_t a) {
+// LLVM-SAME: <4 x i32> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// LLVM-NEXT:    ret <4 x i32> [[VCEQZ_I]]
+  return vceqzq_s32(a);
+}
+
+// LLVM-LABEL: @test_vceqzq_s64(
+// CIR-LABEL: @vceqzq_s64(
+uint64x2_t test_vceqzq_s64(int64x2_t a) {
+// LLVM-SAME: <2 x i64> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// LLVM-NEXT:    ret <2 x i64> [[VCEQZ_I]]
+  return vceqzq_s64(a);
+}
+
+// LLVM-LABEL: @test_vceqz_u8(
+// CIR-LABEL: @vceqz_u8(
+uint8x8_t test_vceqz_u8(uint8x8_t a) {
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<8 x !u8i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<8 x !u8i>
+
+// LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// LLVM-NEXT:    ret <8 x i8> [[VCEQZ_I]]
+  return vceqz_u8(a);
+}
+
+// LLVM-LABEL: @test_vceqz_u16(
+// CIR-LABEL: @vceqz_u16(
+uint16x4_t test_vceqz_u16(uint16x4_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !u16i>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<4 x !u16i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<4 x !u16i>, !cir.vector<4 x !s16i>
+
+// LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// LLVM-NEXT:    ret <4 x i16> [[VCEQZ_I]]
+  return vceqz_u16(a);
+}
+
+// LLVM-LABEL: @test_vceqz_u32(
+// CIR-LABEL: @vceqz_u32(
+uint32x2_t test_vceqz_u32(uint32x2_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !u32i>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<2 x !u32i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<2 x !u32i>, !cir.vector<2 x !s32i>
+
+// LLVM-SAME: <2 x i32> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// LLVM-NEXT:    ret <2 x i32> [[VCEQZ_I]]
+  return vceqz_u32(a);
+}
+
+// LLVM-LABEL: @test_vceqzq_u8(
+// CIR-LABEL: @vceqzq_u8(
+uint8x16_t test_vceqzq_u8(uint8x16_t a) {
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<16 x !u8i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<16 x !u8i>
+
+// LLVM-SAME: <16 x i8> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// LLVM-NEXT:    ret <16 x i8> [[VCEQZ_I]]
+  return vceqzq_u8(a);
+}
+
+// LLVM-LABEL: @test_vceqzq_u16(
+// CIR-LABEL: @vceqzq_u16(
+uint16x8_t test_vceqzq_u16(uint16x8_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !u16i>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<8 x !u16i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<8 x !u16i>, !cir.vector<8 x !s16i>
+
+// LLVM-SAME: <8 x i16> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// LLVM-NEXT:    ret <8 x i16> [[VCEQZ_I]]
+  return vceqzq_u16(a);
+}
+
+// LLVM-LABEL: @test_vceqzq_u32(
+// CIR-LABEL: @vceqzq_u32(
+uint32x4_t test_vceqzq_u32(uint32x4_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !u32i>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<4 x !u32i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<4 x !u32i>, !cir.vector<4 x !s32i>
+
+// LLVM-SAME: <4 x i32> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// LLVM-NEXT:    ret <4 x i32> [[VCEQZ_I]]
+  return vceqzq_u32(a);
+}
+
+// LLVM-LABEL: @test_vceqzq_u64(
+// CIR-LABEL: @vceqzq_u64(
+uint64x2_t test_vceqzq_u64(uint64x2_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<16 x !s8i> -> !cir.vector<2 x !u64i>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<2 x !u64i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<2 x !u64i>, !cir.vector<2 x !s64i>
+
+// LLVM-SAME: <2 x i64> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// LLVM-NEXT:    ret <2 x i64> [[VCEQZ_I]]
+  return vceqzq_u64(a);
+}
+
+// LLVM-LABEL: @test_vceqz_f32(
+// CIR-LABEL: @vceqz_f32(
+uint32x2_t test_vceqz_f32(float32x2_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !cir.float>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<2 x !cir.float>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<2 x !cir.float>, !cir.vector<2 x !s32i>
+
+// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// LLVM-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// LLVM-NEXT:    [[TMP3:%.*]] = fcmp oeq <2 x float> [[TMP2]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
+// LLVM-NEXT:    ret <2 x i32> [[VCEQZ_I]]
+  return vceqz_f32(a);
+}
+
+// LLVM-LABEL: @test_vceqz_f64(
+// CIR-LABEL: @vceqz_f64(
+uint64x1_t test_vceqz_f64(float64x1_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<8 x !s8i> -> !cir.vector<1 x !cir.double>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<1 x !cir.double>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<1 x !cir.double>, !cir.vector<1 x !s64i>
+
+// LLVM-SAME: <1 x double> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// LLVM-NEXT:    [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// LLVM-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// LLVM-NEXT:    [[TMP3:%.*]] = fcmp oeq <1 x double> [[TMP2]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64>
+// LLVM-NEXT:    ret <1 x i64> [[VCEQZ_I]]
+  return vceqz_f64(a);
+}
+
+// LLVM-LABEL: @test_vceqzq_f32(
+// CIR-LABEL: @vceqzq_f32(
+uint32x4_t test_vceqzq_f32(float32x4_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !cir.float>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<4 x !cir.float>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+
+// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// LLVM-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// LLVM-NEXT:    [[TMP3:%.*]] = fcmp oeq <4 x float> [[TMP2]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+// LLVM-NEXT:    ret <4 x i32> [[VCEQZ_I]]
+  return vceqzq_f32(a);
+}
+
+// LLVM-LABEL: @test_vceqz_p8(
+// CIR-LABEL: @vceqz_p8(
+uint8x8_t test_vceqz_p8(poly8x8_t a) {
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<8 x !s8i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<8 x !s8i>
+
+// LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// LLVM-NEXT:    ret <8 x i8> [[VCEQZ_I]]
+  return vceqz_p8(a);
+}
+
+// LLVM-LABEL: @test_vceqzq_p8(
+// CIR-LABEL: @vceqzq_p8(
+uint8x16_t test_vceqzq_p8(poly8x16_t a) {
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<16 x !s8i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<16 x !s8i>
+
+// LLVM-SAME: <16 x i8> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// LLVM-NEXT:    ret <16 x i8> [[VCEQZ_I]]
+  return vceqzq_p8(a);
+}
+
+// LLVM-LABEL: @test_vceqzq_f64(
+// CIR-LABEL: @vceqzq_f64(
+uint64x2_t test_vceqzq_f64(float64x2_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<16 x !s8i> -> !cir.vector<2 x !cir.double>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<2 x !cir.double>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<2 x !cir.double>, !cir.vector<2 x !s64i>
+
+// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// LLVM-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// LLVM-NEXT:    [[TMP3:%.*]] = fcmp oeq <2 x double> [[TMP2]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+// LLVM-NEXT:    ret <2 x i64> [[VCEQZ_I]]
+  return vceqzq_f64(a);
+}
+
+// LLVM-LABEL: @test_vceqzq_p64(
+// CIR-LABEL: @vceqzq_p64(
+uint64x2_t test_vceqzq_p64(poly64x2_t a) {
+// CIR:   cir.cast bitcast {{%.*}} : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i>
+// CIR:   [[C_0:%.*]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+// CIR:   cir.vec.cmp(eq, {{%.*}}, [[C_0]]) : !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>
+
+// LLVM-SAME: <2 x i64> {{.*}} [[A:%.*]]) #[[ATTR0]] {
+// LLVM:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// LLVM-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// LLVM-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
+// LLVM-NEXT:    [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// LLVM-NEXT:    ret <2 x i64> [[VCEQZ_I]]
+  return vceqzq_p64(a);
 }
 
 //===------------------------------------------------------===//

>From ea1bbc72e19c220ca6866c429659eb84ed4faa95 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Thu, 5 Mar 2026 16:15:50 +0000
Subject: [PATCH 2/2] Address PR comments

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 6 +++---
 clang/lib/CIR/FrontendAction/CIRGenAction.cpp  | 2 --
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index d495b02eb7fbd..693845e09e82e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -97,7 +97,7 @@ struct ARMVectorIntrinsicInfo {
   {#NameBase, NEON::BI__builtin_neon_##NameBase, Intrinsic::LLVMIntrinsic,     \
    Intrinsic::AltLLVMIntrinsic, TypeModifier}
 
-static const armVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
+static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
     NEONMAP0(splat_lane_v),
     NEONMAP0(splat_laneq_v),
     NEONMAP0(splatq_lane_v),
@@ -415,7 +415,7 @@ static const armVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
 
 #define SVEMAP2(NameBase, TypeModifier)                                        \
   {#NameBase, SVE::BI__builtin_sve_##NameBase, 0, 0, TypeModifier}
-static const armVectorIntrinsicInfo aarch64SVEIntrinsicMap[] = {
+static const ARMVectorIntrinsicInfo aarch64SVEIntrinsicMap[] = {
 #define GET_SVE_LLVM_INTRINSIC_MAP
 #include "clang/Basic/arm_sve_builtin_cg.inc"
 #undef GET_SVE_LLVM_INTRINSIC_MAP
@@ -2233,7 +2233,7 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
 
   // Not all intrinsics handled by the common case work for AArch64 yet, so only
   // defer to common code if it's been added to our special map.
-  const armVectorIntrinsicInfo *builtin;
+  const ARMVectorIntrinsicInfo *builtin;
   builtin = findARMVectorIntrinsicInMap(AArch64SIMDIntrinsicMap, builtinID,
                                         aarch64SIMDIntrinsicsProvenSorted);
 
diff --git a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp
index b15d7f699119c..19c407545b961 100644
--- a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp
+++ b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp
@@ -102,8 +102,6 @@ class CIRGenConsumer : public clang::ASTConsumer {
 
     if (!FEOptions.ClangIRDisableCIRVerifier) {
       if (!Gen->verifyModule()) {
-        // HACK!!
-        Gen->getModule().dump();
         CI.getDiagnostics().Report(
             diag::err_cir_verification_failed_pre_passes);
         llvm::report_fatal_error(



More information about the llvm-branch-commits mailing list