[clang] [CIR][NEON] Add lowering support for `vceqzd_s64` (PR #179779)
Andrzej Warzyński via cfe-commits
cfe-commits at lists.llvm.org
Fri Feb 6 10:27:44 PST 2026
https://github.com/banach-space updated https://github.com/llvm/llvm-project/pull/179779
>From 5711026c2f77780fa4e7eae9d57b7ee590a873ce Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Wed, 4 Feb 2026 19:45:47 +0000
Subject: [PATCH 1/3] [CIR][NEON] Add lowering support for `vceqzd_s64`
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
SUMMARY
-------
Add support in the CIR lowering path for the NEON intrinsic
`vceqzd_s64`. This ensures the mask produced by the intrinsic is
sign-extended (not zero-extended) when lowering from scalar inputs,
matching vector cmp semantics.
IMPLEMENTATION
--------------
Problem: when lowering the scalar form of `vceqzd_s64`, the default integer
compare-and-extension sequence produces a zero-extended result, while
vector cmp results are treated as masks and are sign-extended. The default
scalar lowering would therefore be incorrect:
```llvm
%11 = icmp eq i64 %10, zeroinitializer
%12 = zext i1 %11 to i64
```
Instead, the scalar input is treated as a 1-element vector to guarantee
sign-extension:
```llvm
%10 = insertelement <1 x i64> poison, i64 %9, i64 0
%11 = icmp eq <1 x i64> %10, zeroinitializer
%12 = sext <1 x i1> %11 to <1 x i64>
%13 = extractelement <1 x i64> %12, i64 0
```
The above sequence can be cleaned up with `opt --passes=instcombine`,
yielding:
```llvm
%11 = icmp eq i64 %10, zeroinitializer
%12 = sext i1 %11 to i64
```
To avoid using 1-element vectors, CIR would need a dedicated
`cir::SExtOp` to allow explicit control over the extension kind.
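For illustration only, a minimal sketch of what that could look like, assuming a hypothetical `cir.sext` op (no such op exists today; only the `cir.cmp` line below matches real CIR syntax):
```mlir
// %0 holds the i64 input; %cst0 is a cir.const #cir.int<0>.
%1 = cir.cmp(eq, %0, %cst0) : !s64i, !cir.bool
// Hypothetical op: extend the i1 mask with an explicit sign-extension.
%2 = cir.sext %1 : !cir.bool -> !u64i
```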
TESTING
-------
Rather than creating a dedicated ClangIR test file, the original test
file for this intrinsic is effectively reused:
* clang/test/CodeGen/AArch64/neon-intrinsics.c
“Effectively” means that the corresponding test is moved (rather than
literally reused) to a new file within the original AArch64 builtins
test directory:
* clang/test/CodeGen/AArch64/neon/intrinsics.c
This is necessary to avoid lowering, with `-fclangir`, examples from
neon-intrinsics.c that are not yet supported. The new file will eventually
replace the original one once all builtins from it can be lowered via ClangIR.
TESTING INFRA
-------------
New LIT "feature" is added so that CIR tests can be run conditionally,
e.g. the following will only run when `CLANG_ENABLE_CIR` is set:
```C
// RUN: %if cir %{%clang_cc1 ... %}
```
REFERENCES
----------
(*) https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]&q=vceqzd_s64
---
.../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 56 +++++++++++++++++++
clang/lib/CIR/CodeGen/CIRGenFunction.h | 8 +++
.../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 1 +
clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 11 +++-
clang/test/CodeGen/AArch64/neon-intrinsics.c | 11 ----
clang/test/CodeGen/AArch64/neon/intrinsics.c | 39 +++++++++++++
clang/test/lit.cfg.py | 2 +
7 files changed, 114 insertions(+), 14 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/neon/intrinsics.c
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index a8ede132f4eca..d785d776ab7b4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -88,6 +88,54 @@ findARMVectorIntrinsicInMap(ArrayRef<AArch64BuiltinInfo> intrinsicMap,
return nullptr;
}
+//===----------------------------------------------------------------------===//
+// Emit-helpers
+//===----------------------------------------------------------------------===//
+mlir::Value CIRGenFunction::emitAArch64CompareBuiltinExpr(
+ mlir::Location loc, mlir::Value src, mlir::Type ty,
+ const llvm::CmpInst::Predicate pred) {
+
+ mlir::Value res;
+ if (isa<cir::VectorType>(ty) && !cast<cir::VectorType>(ty).getIsScalable()) {
+ // Vector types are cast to i8 vectors. Recover original type.
+ cgm.errorNYI(loc, std::string("unimplemented vector compare"));
+ }
+
+ // Scalar compare is a special case that is artificially converted to a
+ // 1-element vector compare. This is to guarantee that the output result is
+ // sign- rather than zero-extended.
+ //
+ // Specifically, a compare Op will generate an i1 result that needs to be
+ // extended to match the in/out type, `ty`. A regular scalar cast would lead
+ // to ZExt to preserve the value, e.g. 0b1 --> 0b00000001 (i1 -> i8).
+ // Vector compares are meant to generate masks, and these are extended via
+ // SExt, so that 0b1 --> 0b11111111 and 0b0 --> 0b00000000.
+ bool scalarInputs = isa<cir::IntType>(src.getType());
+
+ mlir::Value zero = builder.getNullValue(ty, loc);
+ if (CmpInst::isFPPredicate(pred)) {
+ cgm.errorNYI(loc, std::string("unimplemented FP compare"));
+ // TODO:
+ // if (Pred == CmpInst::FCMP_OEQ)
+ // else
+ } else {
+ if (scalarInputs) {
+ cir::VectorType ty = cir::VectorType::get(src.getType(), 1, false);
+ src = cir::VecSplatOp::create(builder, loc, ty, src);
+ zero = cir::VecSplatOp::create(builder, loc, ty, zero);
+ }
+ }
+
+ mlir::Value cmp =
+ builder.createVecCompare(loc, cir::CmpOpKind::eq, src, zero);
+
+ if (scalarInputs)
+ cmp =
+ cir::VecExtractOp::create(builder, loc, cmp, builder.getUInt64(0, loc));
+
+ return builder.createBitcast(loc, cmp, builder.getUInt64Ty());
+}
+
bool CIRGenFunction::getAArch64SVEProcessedOperands(
unsigned builtinID, const CallExpr *expr, SmallVectorImpl<mlir::Value> &ops,
SVETypeFlags typeFlags) {
@@ -1279,7 +1327,15 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
case NEON::BI__builtin_neon_vpaddd_s64:
case NEON::BI__builtin_neon_vpaddd_f64:
case NEON::BI__builtin_neon_vpadds_f32:
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented AArch64 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return mlir::Value{};
case NEON::BI__builtin_neon_vceqzd_s64:
+ ops.push_back(emitScalarExpr(expr->getArg(0)));
+ return emitAArch64CompareBuiltinExpr(
+ loc, ops[0], convertType(expr->getArg(0)->getType()),
+ llvm::ICmpInst::ICMP_EQ);
case NEON::BI__builtin_neon_vceqzd_f64:
case NEON::BI__builtin_neon_vceqzs_f32:
case NEON::BI__builtin_neon_vceqzh_f16:
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index adcf4d56e3892..1008826b98951 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -36,6 +36,7 @@
#include "clang/CIR/MissingFeatures.h"
#include "clang/CIR/TypeEvaluationKind.h"
#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/IR/Instructions.h"
namespace {
class ScalarExprEmitter;
@@ -1288,6 +1289,13 @@ class CIRGenFunction : public CIRGenTypeCache {
SourceLocation assumptionLoc,
int64_t alignment,
mlir::Value offsetValue = nullptr);
+ /// -----------------------------
+ /// CIR emit functions -- AArch64
+ /// -----------------------------
+ mlir::Value
+ emitAArch64CompareBuiltinExpr(mlir::Location loc, mlir::Value src,
+ mlir::Type ty,
+ const llvm::CmpInst::Predicate pred);
private:
void emitAndUpdateRetAlloca(clang::QualType type, mlir::Location loc,
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 5d2a1098799bb..4c97fb877fdae 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1293,6 +1293,7 @@ mlir::LogicalResult CIRToLLVMCastOpLowering::matchAndRewrite(
auto llvmSrcTy = mlir::cast<mlir::IntegerType>(llvmSrcVal.getType());
auto llvmDstTy =
mlir::cast<mlir::IntegerType>(getTypeConverter()->convertType(dstTy));
+
if (llvmSrcTy.getWidth() == llvmDstTy.getWidth())
rewriter.replaceOpWithNewOp<mlir::LLVM::BitcastOp>(castOp, llvmDstTy,
llvmSrcVal);
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 2d7128bf95df2..c45e819a03855 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -1721,6 +1721,9 @@ Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
return CGM.getIntrinsic(IntrinsicID, Tys);
}
+//===----------------------------------------------------------------------===//
+// Emit-helpers
+//===----------------------------------------------------------------------===//
static Value *EmitCommonNeonSISDBuiltinExpr(
CodeGenFunction &CGF, const ARMVectorIntrinsicInfo &SISDInfo,
SmallVectorImpl<Value *> &Ops, const CallExpr *E) {
@@ -2494,13 +2497,15 @@ CodeGenFunction::EmitAArch64CompareBuiltinExpr(Value *Op, llvm::Type *Ty,
Op = Builder.CreateBitCast(Op, Ty);
}
+ Constant *zero = Constant::getNullValue(Op->getType());
+
if (CmpInst::isFPPredicate(Pred)) {
if (Pred == CmpInst::FCMP_OEQ)
- Op = Builder.CreateFCmp(Pred, Op, Constant::getNullValue(Op->getType()));
+ Op = Builder.CreateFCmp(Pred, Op, zero);
else
- Op = Builder.CreateFCmpS(Pred, Op, Constant::getNullValue(Op->getType()));
+ Op = Builder.CreateFCmpS(Pred, Op, zero);
} else {
- Op = Builder.CreateICmp(Pred, Op, Constant::getNullValue(Op->getType()));
+ Op = Builder.CreateICmp(Pred, Op, zero);
}
llvm::Type *ResTy = Ty;
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index 1c628bbba483f..3fc299b926009 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -17242,17 +17242,6 @@ uint64_t test_vceqd_u64(uint64_t a, uint64_t b) {
return (int64_t)vceqd_u64(a, b);
}
-// CHECK-LABEL: define dso_local i64 @test_vceqzd_s64(
-// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[A]], 0
-// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK-NEXT: ret i64 [[VCEQZ_I]]
-//
-uint64_t test_vceqzd_s64(int64_t a) {
- return (uint64_t)vceqzd_s64(a);
-}
-
// CHECK-LABEL: define dso_local i64 @test_vceqzd_u64(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/neon/intrinsics.c b/clang/test/CodeGen/AArch64/neon/intrinsics.c
new file mode 100644
index 0000000000000..9400b28917cc6
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/intrinsics.c
@@ -0,0 +1,39 @@
+// REQUIRES: aarch64-registered-target || arm-registered-target
+
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM
+// RUN: %if cir %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa,instcombine | FileCheck %s --check-prefixes=LLVM %}
+// RUN: %if cir %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-cir -o - %s | FileCheck %s --check-prefixes=CIR %}
+
+//=============================================================================
+// NOTES
+//
+// Minor differences between RUNs (e.g. presence of `noundef` attached to
+// arguments, `align` attribute attached to pointers) are matched using
+// catch-alls like {{.*}}.
+//
+// Different labels for CIR stem from an additional function call that is
+// present at the AST and CIR levels, but is inlined at the LLVM IR level.
+//
+// For `-fclangir`, `instcombine` is used to e.g. fold 1-element vectors to
+// scalars.
+//=============================================================================
+
+#include <arm_neon.h>
+
+// LLVM-LABEL: @test_vceqzd_s64
+// CIR-LABEL: @vceqzd_s64
+uint64_t test_vceqzd_s64(int64_t a) {
+// CIR: [[C_0:%.*]] = cir.const #cir.int<0>
+// CIR: [[LHS:%.*]] = cir.vec.splat {{.*}} : !s64i, !cir.vector<1 x !s64i>
+// CIR: [[RHS:%.*]] = cir.vec.splat [[C_0]] : !s64i, !cir.vector<1 x !s64i>
+// CIR: [[CMP:%.*]] = cir.vec.cmp(eq, [[LHS]], [[RHS]]) : !cir.vector<1 x !s64i>, !cir.vector<1 x !s64i>
+// CIR: [[C_0_1:%.*]] = cir.const #cir.int<0> : !u64i
+// CIR: [[RES:%.*]] = cir.vec.extract [[CMP]][[[C_0_1]] : !u64i] : !cir.vector<1 x !s64i>
+// CIR: cir.cast bitcast [[RES]] : !s64i -> !u64i
+
+// LLVM-SAME: i64{{.*}} [[A:%.*]])
+// LLVM: [[TMP0:%.*]] = icmp eq i64 [[A]], 0
+// LLVM-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// LLVM-NEXT: ret i64 [[VCEQZ_I]]
+ return (uint64_t)vceqzd_s64(a);
+}
diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py
index a622f5335354a..0aeabc7e36a61 100644
--- a/clang/test/lit.cfg.py
+++ b/clang/test/lit.cfg.py
@@ -406,6 +406,8 @@ def calculate_arch_features(arch_string):
if config.have_llvm_driver:
config.available_features.add("llvm-driver")
+if config.clang_enable_cir:
+ config.available_features.add("cir")
# Some tests perform deep recursion, which requires a larger pthread stack size
# than the relatively low default of 192 KiB for 64-bit processes on AIX. The
>From 063b2550b25d233ddd41e7e9c219bf14b51a25dd Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Thu, 5 Feb 2026 12:15:49 +0000
Subject: [PATCH 2/3] Address PR comments
* Remove the logic to create 1-element vectors (instead, cast `cir.bool`
  to `cir.int<s, 1>`; see the sketch below)
* Rename the LIT feature `cir` to `cir-enabled`
* Use `cir::CmpOpKind kind` instead of `llvm::CmpInst::Predicate`
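Schematically, the scalar path now lowers to the following CIR (value names
illustrative; the sequence is what the updated test checks match):
```mlir
%1 = cir.cmp(eq, %0, %cst0) : !s64i, !cir.bool
%2 = cir.cast bool_to_int %1 : !cir.bool -> !cir.int<s, 1>
%3 = cir.cast integral %2 : !cir.int<s, 1> -> !u64i
```
Because the intermediate integer is signed and 1 bit wide, the final
`integral` cast sign-extends, so the 1-element-vector detour is no longer
needed.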
---
clang/include/clang/CIR/Dialect/IR/CIROps.td | 4 +-
.../clang/CIR/Dialect/IR/CIROpsEnums.h | 4 ++
.../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 61 ++++++++-----------
clang/lib/CIR/CodeGen/CIRGenFunction.h | 7 +--
clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 3 +
.../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 4 ++
clang/test/CodeGen/AArch64/neon/intrinsics.c | 18 ++----
clang/test/lit.cfg.py | 2 +-
8 files changed, 50 insertions(+), 53 deletions(-)
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 6f2538c09ec5c..bb6922db92b55 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -1941,7 +1941,9 @@ def CIR_CmpOpKind : CIR_I32EnumAttr<"CmpOpKind", "compare operation kind", [
I32EnumAttrCase<"gt", 2>,
I32EnumAttrCase<"ge", 3>,
I32EnumAttrCase<"eq", 4>,
- I32EnumAttrCase<"ne", 5>
+ I32EnumAttrCase<"ne", 5>,
+ // Floating-point only predicates
+ I32EnumAttrCase<"fcmp_first", 20> // TODO: Replace with a real FP compare
]>;
def CIR_CmpOp : CIR_Op<"cmp", [Pure, SameTypeOperands]> {
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROpsEnums.h b/clang/include/clang/CIR/Dialect/IR/CIROpsEnums.h
index dbd030446a6fc..96f3a81742412 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROpsEnums.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIROpsEnums.h
@@ -123,6 +123,10 @@ template <typename Int> inline bool isValidCIRAtomicOrderingCABI(Int value) {
value <= static_cast<Int>(cir::MemOrder::SequentiallyConsistent);
}
+[[maybe_unused]] static bool isFpCompare(CmpOpKind kind) {
+ return kind >= CmpOpKind::fcmp_first;
+}
+
} // namespace cir
#endif // CLANG_CIR_DIALECT_IR_CIROPSENUMS_H
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index d785d776ab7b4..6c1ac99e199c5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -91,49 +91,40 @@ findARMVectorIntrinsicInMap(ArrayRef<AArch64BuiltinInfo> intrinsicMap,
//===----------------------------------------------------------------------===//
// Emit-helpers
//===----------------------------------------------------------------------===//
-mlir::Value CIRGenFunction::emitAArch64CompareBuiltinExpr(
- mlir::Location loc, mlir::Value src, mlir::Type ty,
- const llvm::CmpInst::Predicate pred) {
-
- mlir::Value res;
- if (isa<cir::VectorType>(ty) && !cast<cir::VectorType>(ty).getIsScalable()) {
- // Vector types are cast to i8 vectors. Recover original type.
+mlir::Value
+CIRGenFunction::emitAArch64CompareBuiltinExpr(mlir::Location loc,
+ mlir::Value src, mlir::Type retTy,
+ const cir::CmpOpKind kind) {
+
+ bool scalarCmp = !isa<cir::VectorType>(src.getType());
+ if (!scalarCmp) {
+ assert(cast<cir::VectorType>(retTy).getIsScalable() &&
+ "This is only intended for fixed-width vectors");
+ // Vector retTypes are cast to i8 vectors. Recover original retType.
cgm.errorNYI(loc, std::string("unimplemented vector compare"));
}
- // Scalar compare is a special case that is artificially converted to a
- // 1-element vector compare. This is to guarantee that the output result is
- // sign- rather than zero-extended.
- //
- // Specifically, a compare Op will generate an i1 result that needs to be
- // extended to match the in/out type, `ty`. A regular scalar cast would lead
- // to ZExt to preserve the value, e.g. 0b1 --> 0b00000001 (i1 -> i8).
- // Vector compares are meant to generate masks, and these are extended via
- // SExt, so that 0b1 --> 0b11111111 and 0b0 --> 0b00000000.
- bool scalarInputs = isa<cir::IntType>(src.getType());
-
- mlir::Value zero = builder.getNullValue(ty, loc);
- if (CmpInst::isFPPredicate(pred)) {
+ mlir::Value zero = builder.getNullValue(src.getType(), loc);
+ mlir::Value cmp;
+ if (cir::isFpCompare(kind)) {
cgm.errorNYI(loc, std::string("unimplemented FP compare"));
// TODO:
// if (Pred == CmpInst::FCMP_OEQ)
// else
} else {
- if (scalarInputs) {
- cir::VectorType ty = cir::VectorType::get(src.getType(), 1, false);
- src = cir::VecSplatOp::create(builder, loc, ty, src);
- zero = cir::VecSplatOp::create(builder, loc, ty, zero);
- }
+ if (scalarCmp)
+ // For scalars, cast !cir.bool to !cir.int<s, 1> so that the compare
+ // result is sign- rather than zero-extended when casting to the output
+ // type, `retTy`.
+ cmp = builder.createCast(
+ loc, cir::CastKind::bool_to_int,
+ builder.createCompare(loc, cir::CmpOpKind::eq, src, zero),
+ builder.getSIntNTy(1));
+ else
+ cgm.errorNYI(loc, std::string("unimplemented vector compare"));
}
- mlir::Value cmp =
- builder.createVecCompare(loc, cir::CmpOpKind::eq, src, zero);
-
- if (scalarInputs)
- cmp =
- cir::VecExtractOp::create(builder, loc, cmp, builder.getUInt64(0, loc));
-
- return builder.createBitcast(loc, cmp, builder.getUInt64Ty());
+ return builder.createCast(loc, cir::CastKind::integral, cmp, retTy);
}
bool CIRGenFunction::getAArch64SVEProcessedOperands(
@@ -1334,8 +1325,8 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
case NEON::BI__builtin_neon_vceqzd_s64:
ops.push_back(emitScalarExpr(expr->getArg(0)));
return emitAArch64CompareBuiltinExpr(
- loc, ops[0], convertType(expr->getArg(0)->getType()),
- llvm::ICmpInst::ICMP_EQ);
+ loc, ops[0], convertType(expr->getCallReturnType(getContext())),
+ cir::CmpOpKind::eq);
case NEON::BI__builtin_neon_vceqzd_f64:
case NEON::BI__builtin_neon_vceqzs_f32:
case NEON::BI__builtin_neon_vceqzh_f16:
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 1008826b98951..57b915b3ba716 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -1292,10 +1292,9 @@ class CIRGenFunction : public CIRGenTypeCache {
/// -----------------------------
/// CIR emit functions -- AArch64
/// -----------------------------
- mlir::Value
- emitAArch64CompareBuiltinExpr(mlir::Location loc, mlir::Value src,
- mlir::Type ty,
- const llvm::CmpInst::Predicate pred);
+ mlir::Value emitAArch64CompareBuiltinExpr(mlir::Location loc, mlir::Value src,
+ mlir::Type retTy,
+ cir::CmpOpKind kind);
private:
void emitAndUpdateRetAlloca(clang::QualType type, mlir::Location loc,
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index abdb6ad1ec6af..d481bff0b55b9 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -3062,6 +3062,9 @@ OpFoldResult cir::VecCmpOp::fold(FoldAdaptor adaptor) {
}
break;
}
+ case cir::CmpOpKind::fcmp_first: {
+ llvm_unreachable("FP compare is not yet supported.");
+ }
}
elements[i] = cir::IntAttr::get(getType().getElementType(), cmpResult);
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 4c97fb877fdae..89a498c615a49 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -2704,6 +2704,8 @@ convertCmpKindToICmpPredicate(cir::CmpOpKind kind, bool isSigned) {
return (isSigned ? LLVMICmp::sgt : LLVMICmp::ugt);
case CIR::ge:
return (isSigned ? LLVMICmp::sge : LLVMICmp::uge);
+ case CIR::fcmp_first:
+ llvm_unreachable("Unknown CmpOpKind");
}
llvm_unreachable("Unknown CmpOpKind");
}
@@ -2727,6 +2729,8 @@ convertCmpKindToFCmpPredicate(cir::CmpOpKind kind) {
return LLVMFCmp::ogt;
case CIR::ge:
return LLVMFCmp::oge;
+ case CIR::fcmp_first:
+ llvm_unreachable("Unknown CmpOpKind");
}
llvm_unreachable("Unknown CmpOpKind");
}
diff --git a/clang/test/CodeGen/AArch64/neon/intrinsics.c b/clang/test/CodeGen/AArch64/neon/intrinsics.c
index 9400b28917cc6..6d0b25e6a66b7 100644
--- a/clang/test/CodeGen/AArch64/neon/intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon/intrinsics.c
@@ -1,8 +1,8 @@
// REQUIRES: aarch64-registered-target || arm-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM
-// RUN: %if cir %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa,instcombine | FileCheck %s --check-prefixes=LLVM %}
-// RUN: %if cir %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-cir -o - %s | FileCheck %s --check-prefixes=CIR %}
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-cir -o - %s | FileCheck %s --check-prefixes=CIR %}
//=============================================================================
// NOTES
@@ -13,9 +13,6 @@
//
// Different labels for CIR stem from an additional function call that is
// present at the AST and CIR levels, but is inlined at the LLVM IR level.
-//
-// For `-fclangir`, `instcombine` is used to e.g. fold 1-element vectors to
-// scalars.
//=============================================================================
#include <arm_neon.h>
@@ -24,12 +21,9 @@
// CIR-LABEL: @vceqzd_s64
uint64_t test_vceqzd_s64(int64_t a) {
// CIR: [[C_0:%.*]] = cir.const #cir.int<0>
-// CIR: [[LHS:%.*]] = cir.vec.splat {{.*}} : !s64i, !cir.vector<1 x !s64i>
-// CIR: [[RHS:%.*]] = cir.vec.splat [[C_0]] : !s64i, !cir.vector<1 x !s64i>
-// CIR: [[CMP:%.*]] = cir.vec.cmp(eq, [[LHS]], [[RHS]]) : !cir.vector<1 x !s64i>, !cir.vector<1 x !s64i>
-// CIR: [[C_0_1:%.*]] = cir.const #cir.int<0> : !u64i
-// CIR: [[RES:%.*]] = cir.vec.extract [[CMP]][[[C_0_1]] : !u64i] : !cir.vector<1 x !s64i>
-// CIR: cir.cast bitcast [[RES]] : !s64i -> !u64i
+// CIR: [[CMP:%.*]] = cir.cmp(eq, %{{.*}}, [[C_0]]) : !s64i, !cir.bool
+// CIR: [[RES:%.*]] = cir.cast bool_to_int [[CMP]] : !cir.bool -> !cir.int<s, 1>
+// CIR: cir.cast integral [[RES]] : !cir.int<s, 1> -> !u64i
// LLVM-SAME: i64{{.*}} [[A:%.*]])
// LLVM: [[TMP0:%.*]] = icmp eq i64 [[A]], 0
diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py
index 0aeabc7e36a61..c5e03e8e88aad 100644
--- a/clang/test/lit.cfg.py
+++ b/clang/test/lit.cfg.py
@@ -407,7 +407,7 @@ def calculate_arch_features(arch_string):
config.available_features.add("llvm-driver")
if config.clang_enable_cir:
- config.available_features.add("cir")
+ config.available_features.add("cir-enabled")
# Some tests perform deep recursion, which requires a larger pthread stack size
# than the relatively low default of 192 KiB for 64-bit processes on AIX. The
>From 17f9fc4b0a40e1556e9cc4555c53a8630dd1085c Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 6 Feb 2026 18:27:21 +0000
Subject: [PATCH 3/3] Revert the changes to CIR_CmpOpKind
---
clang/include/clang/CIR/Dialect/IR/CIROps.td | 4 +---
clang/include/clang/CIR/Dialect/IR/CIROpsEnums.h | 4 ----
clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 6 +-----
clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 4 ----
4 files changed, 2 insertions(+), 16 deletions(-)
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index bb6922db92b55..6f2538c09ec5c 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -1941,9 +1941,7 @@ def CIR_CmpOpKind : CIR_I32EnumAttr<"CmpOpKind", "compare operation kind", [
I32EnumAttrCase<"gt", 2>,
I32EnumAttrCase<"ge", 3>,
I32EnumAttrCase<"eq", 4>,
- I32EnumAttrCase<"ne", 5>,
- // Floating-point only predicates
- I32EnumAttrCase<"fcmp_first", 20> // TODO: Replace with a real FP compare
+ I32EnumAttrCase<"ne", 5>
]>;
def CIR_CmpOp : CIR_Op<"cmp", [Pure, SameTypeOperands]> {
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROpsEnums.h b/clang/include/clang/CIR/Dialect/IR/CIROpsEnums.h
index 96f3a81742412..dbd030446a6fc 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROpsEnums.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIROpsEnums.h
@@ -123,10 +123,6 @@ template <typename Int> inline bool isValidCIRAtomicOrderingCABI(Int value) {
value <= static_cast<Int>(cir::MemOrder::SequentiallyConsistent);
}
-[[maybe_unused]] static bool isFpCompare(CmpOpKind kind) {
- return kind >= CmpOpKind::fcmp_first;
-}
-
} // namespace cir
#endif // CLANG_CIR_DIALECT_IR_CIROPSENUMS_H
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 6c1ac99e199c5..22151b69146ff 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -1,5 +1,4 @@
//===---- CIRGenBuiltinAArch64.cpp - Emit CIR for AArch64 builtins --------===//
-//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -106,11 +105,8 @@ CIRGenFunction::emitAArch64CompareBuiltinExpr(mlir::Location loc,
mlir::Value zero = builder.getNullValue(src.getType(), loc);
mlir::Value cmp;
- if (cir::isFpCompare(kind)) {
+ if (cir::isFPOrVectorOfFPType(src.getType())) {
cgm.errorNYI(loc, std::string("unimplemented FP compare"));
- // TODO:
- // if (Pred == CmpInst::FCMP_OEQ)
- // else
} else {
if (scalarCmp)
// For scalars, cast !cir.bool to !cir.int<s, 1> so that the compare
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 89a498c615a49..4c97fb877fdae 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -2704,8 +2704,6 @@ convertCmpKindToICmpPredicate(cir::CmpOpKind kind, bool isSigned) {
return (isSigned ? LLVMICmp::sgt : LLVMICmp::ugt);
case CIR::ge:
return (isSigned ? LLVMICmp::sge : LLVMICmp::uge);
- case CIR::fcmp_first:
- llvm_unreachable("Unknown CmpOpKind");
}
llvm_unreachable("Unknown CmpOpKind");
}
@@ -2729,8 +2727,6 @@ convertCmpKindToFCmpPredicate(cir::CmpOpKind kind) {
return LLVMFCmp::ogt;
case CIR::ge:
return LLVMFCmp::oge;
- case CIR::fcmp_first:
- llvm_unreachable("Unknown CmpOpKind");
}
llvm_unreachable("Unknown CmpOpKind");
}