[clang] [CIR][AArch64] Add lowering for vaba_* and vabd_* builtins (PR #183595)
Andrzej Warzyński via cfe-commits
cfe-commits at lists.llvm.org
Tue Mar 3 05:25:44 PST 2026
https://github.com/banach-space updated https://github.com/llvm/llvm-project/pull/183595
>From b153fe01898695de005ea028134a840aa0dfd8c3 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Thu, 26 Feb 2026 18:44:15 +0000
Subject: [PATCH 1/4] [CIR][AArch64] Add lowering for vaba_* and vabd_*
builtins (1/N)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add CIR lowering for the following AdvSIMD (NEON) intrinsic families:
* vabd_* – Absolute difference
https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#absolute-difference
* vaba_* – Absolute difference and accumulate
https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#absolute-difference-and-accumulate
Tests for these intrinsics were split out from:
test/CodeGen/AArch64/neon-intrinsics.c
and moved to:
test/CodeGen/AArch64/neon/intrinsics.c
The following helper hooks were adapted from the ClangIR project:
* getNeonType, emitNeonCall, emitNeonCallToOp.
NOTE: Quad-word variants (e.g. vabaq_*) are not included in this change
and will be added in a follow-up patch.
Credit to the ClangIR contributors for the original implementation.
---
.../CIR/Dialect/Builder/CIRBaseBuilder.h | 3 +
.../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 137 +++++++++++
clang/test/CodeGen/AArch64/neon-intrinsics.c | 174 --------------
clang/test/CodeGen/AArch64/neon/intrinsics.c | 213 ++++++++++++++++++
4 files changed, 353 insertions(+), 174 deletions(-)
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index efae3d9d894ed..fb96050964fcc 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -349,6 +349,9 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
mlir::IntegerAttr align = {},
cir::SyncScopeKindAttr scope = {},
cir::MemOrderAttr order = {}) {
+ if (mlir::cast<cir::PointerType>(dst.getType()).getPointee() !=
+ val.getType())
+ dst = createPtrBitcast(dst, val.getType());
return cir::StoreOp::create(*this, loc, val, dst, isVolatile, align, scope,
order);
}
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 5129aa75f8f8d..6c7a4fc3edd72 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -290,6 +290,119 @@ static bool hasExtraNeonArgument(unsigned builtinID) {
return mask != 0;
}
+// TODO: Remove `loc` from the list of arguments once all NYIs are gone.
+static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
+ mlir::Location loc,
+ bool hasLegalHalfType = true,
+ bool v1Ty = false,
+ bool allowBFloatArgsAndRet = true) {
+ int isQuad = typeFlags.isQuad();
+ switch (typeFlags.getEltType()) {
+ case NeonTypeFlags::Int8:
+ case NeonTypeFlags::Poly8:
+ return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt8Ty
+ : cgf->sInt8Ty,
+ v1Ty ? 1 : (8 << isQuad));
+ case NeonTypeFlags::MFloat8:
+ cgf->getCIRGenModule().errorNYI(
+ loc, std::string("unimplemented NEON type: MFloat8"));
+ [[fallthrough]];
+ case NeonTypeFlags::Int16:
+ case NeonTypeFlags::Poly16:
+ return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt16Ty
+ : cgf->sInt16Ty,
+ v1Ty ? 1 : (4 << isQuad));
+ case NeonTypeFlags::BFloat16:
+ if (allowBFloatArgsAndRet)
+ cgf->getCIRGenModule().errorNYI(
+ loc, std::string("unimplemented NEON type: BFloat16"));
+ else
+ cgf->getCIRGenModule().errorNYI(
+ loc, std::string("unimplemented NEON type: BFloat16"));
+ [[fallthrough]];
+ case NeonTypeFlags::Float16:
+ if (hasLegalHalfType)
+ cgf->getCIRGenModule().errorNYI(
+ loc, std::string("unimplemented NEON type: Float16"));
+ else
+ cgf->getCIRGenModule().errorNYI(
+ loc, std::string("unimplemented NEON type: Float16"));
+ [[fallthrough]];
+ case NeonTypeFlags::Int32:
+ return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt32Ty
+ : cgf->sInt32Ty,
+ v1Ty ? 1 : (2 << isQuad));
+ case NeonTypeFlags::Int64:
+ case NeonTypeFlags::Poly64:
+ return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt64Ty
+ : cgf->sInt64Ty,
+ v1Ty ? 1 : (1 << isQuad));
+ case NeonTypeFlags::Poly128:
+ // FIXME: i128 and f128 doesn't get fully support in Clang and llvm.
+ // There is a lot of i128 and f128 API missing.
+ // so we use v16i8 to represent poly128 and get pattern matched.
+ cgf->getCIRGenModule().errorNYI(
+ loc, std::string("unimplemented NEON type: Poly128"));
+ [[fallthrough]];
+ case NeonTypeFlags::Float32:
+ return cir::VectorType::get(cgf->getCIRGenModule().floatTy,
+ v1Ty ? 1 : (2 << isQuad));
+ case NeonTypeFlags::Float64:
+ return cir::VectorType::get(cgf->getCIRGenModule().doubleTy,
+ v1Ty ? 1 : (1 << isQuad));
+ }
+ llvm_unreachable("Unknown vector element type!");
+}
+
+template <typename Operation>
+static mlir::Value emitNeonCallToOp(
+ CIRGenBuilderTy &builder, llvm::SmallVector<mlir::Type> argTypes,
+ llvm::SmallVectorImpl<mlir::Value> &args,
+ std::optional<llvm::StringRef> intrinsicName, mlir::Type funcResTy,
+ mlir::Location loc, bool isConstrainedFPIntrinsic = false,
+ unsigned shift = 0, bool rightshift = false) {
+ // TODO: Consider removing the following unreachable when we have
+ // emitConstrainedFPCall feature implemented
+ assert(!cir::MissingFeatures::emitConstrainedFPCall());
+ if (isConstrainedFPIntrinsic)
+ llvm_unreachable("isConstrainedFPIntrinsic NYI");
+
+ for (unsigned j = 0; j < argTypes.size(); ++j) {
+ if (isConstrainedFPIntrinsic) {
+ assert(!cir::MissingFeatures::emitConstrainedFPCall());
+ }
+ if (shift > 0 && shift == j) {
+ llvm_unreachable("shift NYI");
+ } else {
+ args[j] = builder.createBitcast(args[j], argTypes[j]);
+ }
+ }
+ if (isConstrainedFPIntrinsic) {
+ assert(!cir::MissingFeatures::emitConstrainedFPCall());
+ return nullptr;
+ }
+ if constexpr (std::is_same_v<Operation, cir::LLVMIntrinsicCallOp>) {
+ return Operation::create(builder, loc,
+ builder.getStringAttr(intrinsicName.value()),
+ funcResTy, args)
+ .getResult();
+ } else {
+ return Operation::create(builder, loc, funcResTy, args).getResult();
+ }
+}
+
+static mlir::Value emitNeonCall(CIRGenBuilderTy &builder,
+ llvm::SmallVector<mlir::Type> argTypes,
+ llvm::SmallVectorImpl<mlir::Value> &args,
+ llvm::StringRef intrinsicName,
+ mlir::Type funcResTy, mlir::Location loc,
+ bool isConstrainedFPIntrinsic = false,
+ unsigned shift = 0, bool rightshift = false) {
+ return emitNeonCallToOp<cir::LLVMIntrinsicCallOp>(
+ builder, std::move(argTypes), args, intrinsicName, funcResTy, loc,
+ isConstrainedFPIntrinsic, shift, rightshift);
+}
+
std::optional<mlir::Value>
CIRGenFunction::emitAArch64SVEBuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
@@ -1454,6 +1567,16 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
assert(!cir::MissingFeatures::aarch64TblBuiltinExpr());
+ const Expr *arg = expr->getArg(expr->getNumArgs() - 1);
+ NeonTypeFlags type(0);
+ // A trailing constant integer is used for discriminating overloaded builtin
+ // calls. Use it to determine the type of this overloaded NEON intrinsic.
+ if (std::optional<llvm::APSInt> result =
+ arg->getIntegerConstantExpr(getContext()))
+ type = NeonTypeFlags(result->getZExtValue());
+
+ bool usgn = type.isUnsigned();
+
mlir::Location loc = getLoc(expr->getExprLoc());
// Handle non-overloaded intrinsics first.
@@ -1678,6 +1801,12 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
return mlir::Value{};
}
+ cir::VectorType ty = getNeonType(this, type, loc);
+ if (!ty)
+ return nullptr;
+
+ llvm::StringRef intrName;
+
switch (builtinID) {
default:
return std::nullopt;
@@ -1700,7 +1829,15 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
case NEON::BI__builtin_neon_vmin_v:
case NEON::BI__builtin_neon_vminq_v:
case NEON::BI__builtin_neon_vminh_f16:
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented AArch64 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return mlir::Value{};
case NEON::BI__builtin_neon_vabd_v:
+ intrName = usgn ? "aarch64.neon.uabd" : "aarch64.neon.sabd";
+ if (cir::isFPOrVectorOfFPType(ty))
+ intrName = "aarch64.neon.fabd";
+ return emitNeonCall(builder, {ty, ty}, ops, intrName, ty, loc);
case NEON::BI__builtin_neon_vabdq_v:
case NEON::BI__builtin_neon_vpadal_v:
case NEON::BI__builtin_neon_vpadalq_v:
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index 42799d27bba89..909d00630b069 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -1038,88 +1038,6 @@ float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) {
return vdiv_f32(v1, v2);
}
-// CHECK-LABEL: define dso_local <8 x i8> @test_vaba_s8(
-// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[V2]], <8 x i8> [[V3]])
-// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[VABD_I]]
-// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
-//
-int8x8_t test_vaba_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
- return vaba_s8(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vaba_s16(
-// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V3]] to <8 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
-// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[VABD2_I]]
-// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
-//
-int16x4_t test_vaba_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
- return vaba_s16(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vaba_s32(
-// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V3]] to <8 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
-// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[VABD2_I]]
-// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
-//
-int32x2_t test_vaba_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
- return vaba_s32(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <8 x i8> @test_vaba_u8(
-// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[V2]], <8 x i8> [[V3]])
-// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[VABD_I]]
-// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
-//
-uint8x8_t test_vaba_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
- return vaba_u8(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vaba_u16(
-// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V3]] to <8 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
-// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[VABD2_I]]
-// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
-//
-uint16x4_t test_vaba_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
- return vaba_u16(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vaba_u32(
-// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V3]] to <8 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
-// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[VABD2_I]]
-// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
-//
-uint32x2_t test_vaba_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
- return vaba_u32(v1, v2, v3);
-}
-
// CHECK-LABEL: define dso_local <16 x i8> @test_vabaq_s8(
// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
@@ -1202,98 +1120,6 @@ uint32x4_t test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
return vabaq_u32(v1, v2, v3);
}
-// CHECK-LABEL: define dso_local <8 x i8> @test_vabd_s8(
-// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
-// CHECK-NEXT: ret <8 x i8> [[VABD_I]]
-//
-int8x8_t test_vabd_s8(int8x8_t v1, int8x8_t v2) {
- return vabd_s8(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vabd_s16(
-// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
-// CHECK-NEXT: ret <4 x i16> [[VABD2_I]]
-//
-int16x4_t test_vabd_s16(int16x4_t v1, int16x4_t v2) {
- return vabd_s16(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vabd_s32(
-// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
-// CHECK-NEXT: ret <2 x i32> [[VABD2_I]]
-//
-int32x2_t test_vabd_s32(int32x2_t v1, int32x2_t v2) {
- return vabd_s32(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <8 x i8> @test_vabd_u8(
-// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
-// CHECK-NEXT: ret <8 x i8> [[VABD_I]]
-//
-uint8x8_t test_vabd_u8(uint8x8_t v1, uint8x8_t v2) {
- return vabd_u8(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vabd_u16(
-// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
-// CHECK-NEXT: ret <4 x i16> [[VABD2_I]]
-//
-uint16x4_t test_vabd_u16(uint16x4_t v1, uint16x4_t v2) {
- return vabd_u16(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vabd_u32(
-// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
-// CHECK-NEXT: ret <2 x i32> [[VABD2_I]]
-//
-uint32x2_t test_vabd_u32(uint32x2_t v1, uint32x2_t v2) {
- return vabd_u32(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <2 x float> @test_vabd_f32(
-// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> [[VABD_I]], <2 x float> [[VABD1_I]])
-// CHECK-NEXT: ret <2 x float> [[VABD2_I]]
-//
-float32x2_t test_vabd_f32(float32x2_t v1, float32x2_t v2) {
- return vabd_f32(v1, v2);
-}
-
// CHECK-LABEL: define dso_local <16 x i8> @test_vabdq_s8(
// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/neon/intrinsics.c b/clang/test/CodeGen/AArch64/neon/intrinsics.c
index 039a08c23852e..227b23f532fe1 100644
--- a/clang/test/CodeGen/AArch64/neon/intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon/intrinsics.c
@@ -42,3 +42,216 @@ int64_t test_vnegd_s64(int64_t a) {
// LLVM-NEXT: ret i64 [[VNEGD_I]]
return (int64_t)vnegd_s64(a);
}
+
+//===------------------------------------------------------===//
+// 2.1.1.6.1. Absolute difference
+//===------------------------------------------------------===//
+// LLVM-LABEL: @test_vabd_s8(
+// CIR-LABEL: @vabd_s8(
+int8x8_t test_vabd_s8(int8x8_t v1, int8x8_t v2) {
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.sabd" %{{.*}}, %{{.*}} : (!cir.vector<8 x !s8i>, !cir.vector<8 x !s8i>) -> !cir.vector<8 x !s8i>
+
+// LLVM-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]])
+// LLVM: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// LLVM-NEXT: ret <8 x i8> [[VABD_I]]
+ return vabd_s8(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabd_s16(
+// CIR-LABEL: @vabd_s16(
+int16x4_t test_vabd_s16(int16x4_t v1, int16x4_t v2) {
+// CIR: [[V1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !s16i>
+// CIR: [[V2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !s16i>
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.sabd" [[V1]], [[V2]]
+
+// LLVM-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// LLVM-NEXT: ret <4 x i16> [[VABD2_I]]
+ return vabd_s16(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabd_s32(
+// CIR-LABEL: @vabd_s32(
+int32x2_t test_vabd_s32(int32x2_t v1, int32x2_t v2) {
+// CIR: [[V1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !s32i>
+// CIR: [[V2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !s32i>
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.sabd" [[V1]], [[V2]]
+
+// LLVM-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// LLVM-NEXT: ret <2 x i32> [[VABD2_I]]
+ return vabd_s32(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabd_u8(
+// CIR-LABEL: @vabd_u8(
+uint8x8_t test_vabd_u8(uint8x8_t v1, uint8x8_t v2) {
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.uabd" %{{.*}}, %{{.*}} : (!cir.vector<8 x !u8i>, !cir.vector<8 x !u8i>) -> !cir.vector<8 x !u8i>
+
+// LLVM-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]])
+// LLVM: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// LLVM-NEXT: ret <8 x i8> [[VABD_I]]
+ return vabd_u8(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabd_u16(
+// CIR-LABEL: @vabd_u16(
+uint16x4_t test_vabd_u16(uint16x4_t v1, uint16x4_t v2) {
+// CIR: [[V1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !u16i>
+// CIR: [[V2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !u16i>
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.uabd" [[V1]], [[V2]]
+
+// LLVM-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// LLVM-NEXT: ret <4 x i16> [[VABD2_I]]
+ return vabd_u16(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabd_u32(
+// CIR-LABEL: @vabd_u32(
+uint32x2_t test_vabd_u32(uint32x2_t v1, uint32x2_t v2) {
+// CIR: [[V1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !u32i>
+// CIR: [[V2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !u32i>
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.uabd" [[V1]], [[V2]]
+
+// LLVM-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// LLVM-NEXT: ret <2 x i32> [[VABD2_I]]
+ return vabd_u32(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabd_f32(
+// CIR-LABEL: @vabd_f32(
+float32x2_t test_vabd_f32(float32x2_t v1, float32x2_t v2) {
+// CIR: [[V1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !cir.float>
+// CIR: [[V2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.fabd" [[V1]], [[V2]]
+
+// LLVM-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32>
+// LLVM-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// LLVM-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> [[VABD_I]], <2 x float> [[VABD1_I]])
+// LLVM-NEXT: ret <2 x float> [[VABD2_I]]
+ return vabd_f32(v1, v2);
+}
+
+//===------------------------------------------------------===//
+// 2.1.1.6.3. Absolute difference and accumulate
+//
+// The following builtins expand to a call to vabd_{} builtins,
+// which is reflected in the CIR output.
+//===------------------------------------------------------===//
+
+// LLVM-LABEL: @test_vaba_u8(
+// CIR-LABEL: @vaba_u8(
+uint8x8_t test_vaba_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
+// CIR: [[ABD:%.*]] = cir.call @vabd_u8
+// CIR: [[RES:%.*]] = cir.binop(add, {{.*}}, [[ABD]])
+
+// LLVM-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]])
+// LLVM: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[V2]], <8 x i8> [[V3]])
+// LLVM-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[VABD_I]]
+// LLVM-NEXT: ret <8 x i8> [[ADD_I]]
+ return vaba_u8(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vaba_u16(
+// CIR-LABEL: @vaba_u16(
+uint16x4_t test_vaba_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
+// CIR: [[ABD:%.*]] = cir.call @vabd_u16
+// CIR: [[RES:%.*]] = cir.binop(add, {{.*}}, [[ABD]])
+
+// LLVM-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V3]] to <8 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// LLVM-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[VABD2_I]]
+// LLVM-NEXT: ret <4 x i16> [[ADD_I]]
+ return vaba_u16(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vaba_u32(
+// CIR-LABEL: @vaba_u32(
+uint32x2_t test_vaba_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
+// CIR: [[ABD:%.*]] = cir.call @vabd_u32
+// CIR: [[RES:%.*]] = cir.binop(add, {{.*}}, [[ABD]])
+
+// LLVM-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V3]] to <8 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// LLVM-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[VABD2_I]]
+// LLVM-NEXT: ret <2 x i32> [[ADD_I]]
+ return vaba_u32(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vaba_s8(
+// CIR-LABEL: @vaba_s8(
+int8x8_t test_vaba_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
+// CIR: [[ABD:%.*]] = cir.call @vabd_s8
+// CIR: [[RES:%.*]] = cir.binop(add, {{.*}}, [[ABD]])
+
+// LLVM-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]])
+// LLVM: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[V2]], <8 x i8> [[V3]])
+// LLVM-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[VABD_I]]
+// LLVM-NEXT: ret <8 x i8> [[ADD_I]]
+ return vaba_s8(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vaba_s16(
+// CIR-LABEL: @vaba_s16(
+int16x4_t test_vaba_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
+// CIR: [[ABD:%.*]] = cir.call @vabd_s16
+// CIR: [[RES:%.*]] = cir.binop(add, {{.*}}, [[ABD]])
+
+// LLVM-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V3]] to <8 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// LLVM-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[VABD2_I]]
+// LLVM-NEXT: ret <4 x i16> [[ADD_I]]
+ return vaba_s16(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vaba_s32(
+// CIR-LABEL: @vaba_s32(
+int32x2_t test_vaba_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
+// CIR: [[ABD:%.*]] = cir.call @vabd_s32
+// CIR: [[RES:%.*]] = cir.binop(add, {{.*}}, [[ABD]])
+
+// LLVM-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V3]] to <8 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// LLVM-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[VABD2_I]]
+// LLVM-NEXT: ret <2 x i32> [[ADD_I]]
+ return vaba_s32(v1, v2, v3);
+}
>From 53dc48fc93b5acc0c6a28f45c751e44b6728daed Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 27 Feb 2026 17:38:14 +0000
Subject: [PATCH 2/4] Add quad-word variants
---
.../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 2 +-
clang/test/CodeGen/AArch64/neon-intrinsics.c | 190 --------------
clang/test/CodeGen/AArch64/neon/intrinsics.c | 247 ++++++++++++++++++
3 files changed, 248 insertions(+), 191 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 6c7a4fc3edd72..2f1c5f35d20d4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -1834,11 +1834,11 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
getContext().BuiltinInfo.getName(builtinID));
return mlir::Value{};
case NEON::BI__builtin_neon_vabd_v:
+ case NEON::BI__builtin_neon_vabdq_v:
intrName = usgn ? "aarch64.neon.uabd" : "aarch64.neon.sabd";
if (cir::isFPOrVectorOfFPType(ty))
intrName = "aarch64.neon.fabd";
return emitNeonCall(builder, {ty, ty}, ops, intrName, ty, loc);
- case NEON::BI__builtin_neon_vabdq_v:
case NEON::BI__builtin_neon_vpadal_v:
case NEON::BI__builtin_neon_vpadalq_v:
case NEON::BI__builtin_neon_vpmin_v:
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index 909d00630b069..33b0b6bc55426 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -1038,196 +1038,6 @@ float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) {
return vdiv_f32(v1, v2);
}
-// CHECK-LABEL: define dso_local <16 x i8> @test_vabaq_s8(
-// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> [[V2]], <16 x i8> [[V3]])
-// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[VABD_I]]
-// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
-//
-int8x16_t test_vabaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
- return vabaq_s8(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vabaq_s16(
-// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V3]] to <16 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]])
-// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[VABD2_I]]
-// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
-//
-int16x8_t test_vabaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
- return vabaq_s16(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vabaq_s32(
-// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V3]] to <16 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]])
-// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[VABD2_I]]
-// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
-//
-int32x4_t test_vabaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
- return vabaq_s32(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vabaq_u8(
-// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> [[V2]], <16 x i8> [[V3]])
-// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[VABD_I]]
-// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
-//
-uint8x16_t test_vabaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
- return vabaq_u8(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vabaq_u16(
-// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V3]] to <16 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]])
-// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[VABD2_I]]
-// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
-//
-uint16x8_t test_vabaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
- return vabaq_u16(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vabaq_u32(
-// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V3]] to <16 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]])
-// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[VABD2_I]]
-// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
-//
-uint32x4_t test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
- return vabaq_u32(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vabdq_s8(
-// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
-// CHECK-NEXT: ret <16 x i8> [[VABD_I]]
-//
-int8x16_t test_vabdq_s8(int8x16_t v1, int8x16_t v2) {
- return vabdq_s8(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vabdq_s16(
-// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]])
-// CHECK-NEXT: ret <8 x i16> [[VABD2_I]]
-//
-int16x8_t test_vabdq_s16(int16x8_t v1, int16x8_t v2) {
- return vabdq_s16(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vabdq_s32(
-// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]])
-// CHECK-NEXT: ret <4 x i32> [[VABD2_I]]
-//
-int32x4_t test_vabdq_s32(int32x4_t v1, int32x4_t v2) {
- return vabdq_s32(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vabdq_u8(
-// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
-// CHECK-NEXT: ret <16 x i8> [[VABD_I]]
-//
-uint8x16_t test_vabdq_u8(uint8x16_t v1, uint8x16_t v2) {
- return vabdq_u8(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vabdq_u16(
-// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]])
-// CHECK-NEXT: ret <8 x i16> [[VABD2_I]]
-//
-uint16x8_t test_vabdq_u16(uint16x8_t v1, uint16x8_t v2) {
- return vabdq_u16(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vabdq_u32(
-// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]])
-// CHECK-NEXT: ret <4 x i32> [[VABD2_I]]
-//
-uint32x4_t test_vabdq_u32(uint32x4_t v1, uint32x4_t v2) {
- return vabdq_u32(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <4 x float> @test_vabdq_f32(
-// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> [[VABD_I]], <4 x float> [[VABD1_I]])
-// CHECK-NEXT: ret <4 x float> [[VABD2_I]]
-//
-float32x4_t test_vabdq_f32(float32x4_t v1, float32x4_t v2) {
- return vabdq_f32(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <2 x double> @test_vabdq_f64(
-// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> [[VABD_I]], <2 x double> [[VABD1_I]])
-// CHECK-NEXT: ret <2 x double> [[VABD2_I]]
-//
-float64x2_t test_vabdq_f64(float64x2_t v1, float64x2_t v2) {
- return vabdq_f64(v1, v2);
-}
-
// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_s8(
// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/neon/intrinsics.c b/clang/test/CodeGen/AArch64/neon/intrinsics.c
index 227b23f532fe1..783322db33f55 100644
--- a/clang/test/CodeGen/AArch64/neon/intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon/intrinsics.c
@@ -155,6 +155,159 @@ float32x2_t test_vabd_f32(float32x2_t v1, float32x2_t v2) {
return vabd_f32(v1, v2);
}
+// LLVM-LABEL: @test_vabd_f64(
+// CIR-LABEL: @vabd_f64(
+float64x1_t test_vabd_f64(float64x1_t v1, float64x1_t v2) {
+// CIR: [[V1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<1 x !cir.double>
+// CIR: [[V2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<1 x !cir.double>
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.fabd" [[V1]], [[V2]]
+
+// LLVM-SAME: <1 x double> noundef [[V1:%.*]], <1 x double> noundef [[V2:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <1 x double> [[V1]] to i64
+// LLVM-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[V2]] to i64
+// LLVM-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// LLVM-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// LLVM-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> [[VABD_I]], <1 x double> [[VABD1_I]])
+// LLVM-NEXT: ret <1 x double> [[VABD2_I]]
+ return vabd_f64(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabdq_s8(
+// CIR-LABEL: @vabdq_s8(
+int8x16_t test_vabdq_s8(int8x16_t v1, int8x16_t v2) {
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.sabd" %{{.*}}, %{{.*}} : (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>) -> !cir.vector<16 x !s8i>
+
+// LLVM-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]])
+// LLVM: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// LLVM-NEXT: ret <16 x i8> [[VABD_I]]
+ return vabdq_s8(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabdq_s16(
+// CIR-LABEL: @vabdq_s16(
+int16x8_t test_vabdq_s16(int16x8_t v1, int16x8_t v2) {
+// CIR: [[V1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !s16i>
+// CIR: [[V2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !s16i>
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.sabd" [[V1]], [[V2]]
+
+// LLVM-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]])
+// LLVM-NEXT: ret <8 x i16> [[VABD2_I]]
+ return vabdq_s16(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabdq_s32(
+// CIR-LABEL: @vabdq_s32(
+int32x4_t test_vabdq_s32(int32x4_t v1, int32x4_t v2) {
+// CIR: [[V1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !s32i>
+// CIR: [[V2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !s32i>
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.sabd" [[V1]], [[V2]]
+
+// LLVM-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]])
+// LLVM-NEXT: ret <4 x i32> [[VABD2_I]]
+ return vabdq_s32(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabdq_u8(
+// CIR-LABEL: @vabdq_u8(
+uint8x16_t test_vabdq_u8(uint8x16_t v1, uint8x16_t v2) {
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.uabd" %{{.*}}, %{{.*}} : (!cir.vector<16 x !u8i>, !cir.vector<16 x !u8i>) -> !cir.vector<16 x !u8i>
+
+// LLVM-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]])
+// LLVM: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// LLVM-NEXT: ret <16 x i8> [[VABD_I]]
+ return vabdq_u8(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabdq_u16(
+// CIR-LABEL: @vabdq_u16(
+uint16x8_t test_vabdq_u16(uint16x8_t v1, uint16x8_t v2) {
+// CIR: [[V1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !u16i>
+// CIR: [[V2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !u16i>
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.uabd" [[V1]], [[V2]]
+
+// LLVM-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]])
+// LLVM-NEXT: ret <8 x i16> [[VABD2_I]]
+ return vabdq_u16(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabdq_u32(
+// CIR-LABEL: @vabdq_u32(
+uint32x4_t test_vabdq_u32(uint32x4_t v1, uint32x4_t v2) {
+// CIR: [[V1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !u32i>
+// CIR: [[V2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !u32i>
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.uabd" [[V1]], [[V2]]
+
+// LLVM-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]])
+// LLVM-NEXT: ret <4 x i32> [[VABD2_I]]
+ return vabdq_u32(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabdq_f32(
+// CIR-LABEL: @vabdq_f32(
+float32x4_t test_vabdq_f32(float32x4_t v1, float32x4_t v2) {
+// CIR: [[V1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !cir.float>
+// CIR: [[V2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.fabd" [[V1]], [[V2]]
+
+// LLVM-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32>
+// LLVM-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// LLVM-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> [[VABD_I]], <4 x float> [[VABD1_I]])
+// LLVM-NEXT: ret <4 x float> [[VABD2_I]]
+ return vabdq_f32(v1, v2);
+}
+
+// LLVM-LABEL: @test_vabdq_f64(
+// CIR-LABEL: @vabdq_f64(
+float64x2_t test_vabdq_f64(float64x2_t v1, float64x2_t v2) {
+// CIR: [[V1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<2 x !cir.double>
+// CIR: [[V2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<2 x !cir.double>
+// CIR: cir.call_llvm_intrinsic "aarch64.neon.fabd" [[V1]], [[V2]]
+
+// LLVM-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64>
+// LLVM-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// LLVM-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> [[VABD_I]], <2 x double> [[VABD1_I]])
+// LLVM-NEXT: ret <2 x double> [[VABD2_I]]
+ return vabdq_f64(v1, v2);
+}
+
+// TODO SISD variants:
+// TODO @vabdd_f64(a, b);
+// TODO @test_vabds_f32(
+
//===------------------------------------------------------===//
// 2.1.1.6.3. Absolute difference and accumulate
//
@@ -255,3 +408,97 @@ int32x2_t test_vaba_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
// LLVM-NEXT: ret <2 x i32> [[ADD_I]]
return vaba_s32(v1, v2, v3);
}
+
+// LLVM-LABEL: @test_vabaq_s8(
+// CIR-LABEL: @vabaq_s8(
+int8x16_t test_vabaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
+// CIR: [[ABD:%.*]] = cir.call @vabdq_s8
+// CIR: [[RES:%.*]] = cir.binop(add, {{.*}}, [[ABD]])
+
+// LLVM-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]])
+// LLVM: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> [[V2]], <16 x i8> [[V3]])
+// LLVM-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[VABD_I]]
+// LLVM-NEXT: ret <16 x i8> [[ADD_I]]
+ return vabaq_s8(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vabaq_s16(
+// CIR-LABEL: @vabaq_s16(
+int16x8_t test_vabaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
+// CIR: [[ABD:%.*]] = cir.call @vabdq_s16
+// CIR: [[RES:%.*]] = cir.binop(add, {{.*}}, [[ABD]])
+
+// LLVM-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V3]] to <16 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]])
+// LLVM-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[VABD2_I]]
+// LLVM-NEXT: ret <8 x i16> [[ADD_I]]
+ return vabaq_s16(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vabaq_s32(
+// CIR-LABEL: @vabaq_s32(
+int32x4_t test_vabaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
+// CIR: [[ABD:%.*]] = cir.call @vabdq_s32
+// CIR: [[RES:%.*]] = cir.binop(add, {{.*}}, [[ABD]])
+
+// LLVM-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V3]] to <16 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]])
+// LLVM-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[VABD2_I]]
+// LLVM-NEXT: ret <4 x i32> [[ADD_I]]
+ return vabaq_s32(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vabaq_u8(
+// CIR-LABEL: @vabaq_u8(
+uint8x16_t test_vabaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
+// CIR: [[ABD:%.*]] = cir.call @vabdq_u8
+// CIR: [[RES:%.*]] = cir.binop(add, {{.*}}, [[ABD]])
+
+// LLVM-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]])
+// LLVM: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> [[V2]], <16 x i8> [[V3]])
+// LLVM-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[VABD_I]]
+// LLVM-NEXT: ret <16 x i8> [[ADD_I]]
+ return vabaq_u8(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vabaq_u16(
+// CIR-LABEL: @vabaq_u16(
+uint16x8_t test_vabaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
+// CIR: [[ABD:%.*]] = cir.call @vabdq_u16
+// CIR: [[RES:%.*]] = cir.binop(add, {{.*}}, [[ABD]])
+
+// LLVM-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V3]] to <16 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]])
+// LLVM-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[VABD2_I]]
+// LLVM-NEXT: ret <8 x i16> [[ADD_I]]
+ return vabaq_u16(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vabaq_u32(
+// CIR-LABEL: @vabaq_u32(
+uint32x4_t test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
+// CIR: [[ABD:%.*]] = cir.call @vabdq_u32
+// CIR: [[RES:%.*]] = cir.binop(add, {{.*}}, [[ABD]])
+
+// LLVM-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]])
+// LLVM: [[TMP0:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
+// LLVM-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V3]] to <16 x i8>
+// LLVM-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// LLVM-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// LLVM-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]])
+// LLVM-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[VABD2_I]]
+// LLVM-NEXT: ret <4 x i32> [[ADD_I]]
+ return vabaq_u32(v1, v2, v3);
+}
>From e8d6c405bdabba36338abbfe24644bb33b83107b Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Tue, 3 Mar 2026 09:30:12 +0000
Subject: [PATCH 3/4] Address PR comments
---
.../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 31 +++++++++++--------
1 file changed, 18 insertions(+), 13 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 2f1c5f35d20d4..f734752f7cff8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -290,7 +290,7 @@ static bool hasExtraNeonArgument(unsigned builtinID) {
return mask != 0;
}
-// TODO: Remove `loc` from the list of arguments once all NYIs are gone.
+// TODO(cir): Remove `loc` from the list of arguments once all NYIs are gone.
static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
mlir::Location loc,
bool hasLegalHalfType = true,
@@ -354,25 +354,29 @@ static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
llvm_unreachable("Unknown vector element type!");
}
+// TODO(cir): Remove `cgm` from the list of arguments once all NYI(s) are gone.
template <typename Operation>
-static mlir::Value emitNeonCallToOp(
- CIRGenBuilderTy &builder, llvm::SmallVector<mlir::Type> argTypes,
- llvm::SmallVectorImpl<mlir::Value> &args,
- std::optional<llvm::StringRef> intrinsicName, mlir::Type funcResTy,
- mlir::Location loc, bool isConstrainedFPIntrinsic = false,
- unsigned shift = 0, bool rightshift = false) {
- // TODO: Consider removing the following unreachable when we have
+static mlir::Value
+emitNeonCallToOp(CIRGenModule &cgm, CIRGenBuilderTy &builder,
+ llvm::SmallVector<mlir::Type> argTypes,
+ llvm::SmallVectorImpl<mlir::Value> &args,
+ std::optional<llvm::StringRef> intrinsicName,
+ mlir::Type funcResTy, mlir::Location loc,
+ bool isConstrainedFPIntrinsic = false, unsigned shift = 0,
+ bool rightshift = false) {
+ // TODO(cir): Consider removing the following unreachable when we have
// emitConstrainedFPCall feature implemented
assert(!cir::MissingFeatures::emitConstrainedFPCall());
if (isConstrainedFPIntrinsic)
- llvm_unreachable("isConstrainedFPIntrinsic NYI");
+ cgm.errorNYI(loc, std::string("unimplemented constrained FP intrinsic"));
for (unsigned j = 0; j < argTypes.size(); ++j) {
if (isConstrainedFPIntrinsic) {
assert(!cir::MissingFeatures::emitConstrainedFPCall());
}
if (shift > 0 && shift == j) {
- llvm_unreachable("shift NYI");
+ cgm.errorNYI(loc,
+ std::string("unimplemented intrinsic requiring a shift Op"));
} else {
args[j] = builder.createBitcast(args[j], argTypes[j]);
}
@@ -391,7 +395,8 @@ static mlir::Value emitNeonCallToOp(
}
}
-static mlir::Value emitNeonCall(CIRGenBuilderTy &builder,
+// TODO(cir): Remove `cgm` from the list of arguments once all NYI(s) are gone.
+static mlir::Value emitNeonCall(CIRGenModule &cgm, CIRGenBuilderTy &builder,
llvm::SmallVector<mlir::Type> argTypes,
llvm::SmallVectorImpl<mlir::Value> &args,
llvm::StringRef intrinsicName,
@@ -399,7 +404,7 @@ static mlir::Value emitNeonCall(CIRGenBuilderTy &builder,
bool isConstrainedFPIntrinsic = false,
unsigned shift = 0, bool rightshift = false) {
return emitNeonCallToOp<cir::LLVMIntrinsicCallOp>(
- builder, std::move(argTypes), args, intrinsicName, funcResTy, loc,
+ cgm, builder, std::move(argTypes), args, intrinsicName, funcResTy, loc,
isConstrainedFPIntrinsic, shift, rightshift);
}
@@ -1838,7 +1843,7 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
intrName = usgn ? "aarch64.neon.uabd" : "aarch64.neon.sabd";
if (cir::isFPOrVectorOfFPType(ty))
intrName = "aarch64.neon.fabd";
- return emitNeonCall(builder, {ty, ty}, ops, intrName, ty, loc);
+ return emitNeonCall(cgm, builder, {ty, ty}, ops, intrName, ty, loc);
case NEON::BI__builtin_neon_vpadal_v:
case NEON::BI__builtin_neon_vpadalq_v:
case NEON::BI__builtin_neon_vpmin_v:
>From 3b20f09cf73a89faad15401c13e1904dac153d7f Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Tue, 3 Mar 2026 13:25:10 +0000
Subject: [PATCH 4/4] Tweak NYI msgs
---
.../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 23 +++++++------------
1 file changed, 8 insertions(+), 15 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index f734752f7cff8..df85ba7186775 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -304,8 +304,7 @@ static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
: cgf->sInt8Ty,
v1Ty ? 1 : (8 << isQuad));
case NeonTypeFlags::MFloat8:
- cgf->getCIRGenModule().errorNYI(
- loc, std::string("unimplemented NEON type: MFloat8"));
+ cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: MFloat8"));
[[fallthrough]];
case NeonTypeFlags::Int16:
case NeonTypeFlags::Poly16:
@@ -314,19 +313,15 @@ static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
v1Ty ? 1 : (4 << isQuad));
case NeonTypeFlags::BFloat16:
if (allowBFloatArgsAndRet)
- cgf->getCIRGenModule().errorNYI(
- loc, std::string("unimplemented NEON type: BFloat16"));
+ cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: BFloat16"));
else
- cgf->getCIRGenModule().errorNYI(
- loc, std::string("unimplemented NEON type: BFloat16"));
+ cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: BFloat16"));
[[fallthrough]];
case NeonTypeFlags::Float16:
if (hasLegalHalfType)
- cgf->getCIRGenModule().errorNYI(
- loc, std::string("unimplemented NEON type: Float16"));
+ cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
else
- cgf->getCIRGenModule().errorNYI(
- loc, std::string("unimplemented NEON type: Float16"));
+ cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
[[fallthrough]];
case NeonTypeFlags::Int32:
return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt32Ty
@@ -341,8 +336,7 @@ static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
// FIXME: i128 and f128 doesn't get fully support in Clang and llvm.
// There is a lot of i128 and f128 API missing.
// so we use v16i8 to represent poly128 and get pattern matched.
- cgf->getCIRGenModule().errorNYI(
- loc, std::string("unimplemented NEON type: Poly128"));
+ cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Poly128"));
[[fallthrough]];
case NeonTypeFlags::Float32:
return cir::VectorType::get(cgf->getCIRGenModule().floatTy,
@@ -368,15 +362,14 @@ emitNeonCallToOp(CIRGenModule &cgm, CIRGenBuilderTy &builder,
// emitConstrainedFPCall feature implemented
assert(!cir::MissingFeatures::emitConstrainedFPCall());
if (isConstrainedFPIntrinsic)
- cgm.errorNYI(loc, std::string("unimplemented constrained FP intrinsic"));
+ cgm.errorNYI(loc, std::string("constrained FP intrinsic"));
for (unsigned j = 0; j < argTypes.size(); ++j) {
if (isConstrainedFPIntrinsic) {
assert(!cir::MissingFeatures::emitConstrainedFPCall());
}
if (shift > 0 && shift == j) {
- cgm.errorNYI(loc,
- std::string("unimplemented intrinsic requiring a shift Op"));
+ cgm.errorNYI(loc, std::string("intrinsic requiring a shift Op"));
} else {
args[j] = builder.createBitcast(args[j], argTypes[j]);
}
More information about the cfe-commits
mailing list