[clang] [CIR][X86] Add support for cmp builtins (PR #174318)

Thu Jan 8 05:04:32 PST 2026

https://github.com/YGGkk updated https://github.com/llvm/llvm-project/pull/174318

>From 21b98b98581eed27dfed20f96c47d38680d0e36f Mon Sep 17 00:00:00 2001
From: Zhihui Yang <youngwisdm at gmail.com>
Date: Sun, 4 Jan 2026 00:46:24 -0800
Subject: [PATCH 1/3] [PATCH] [PATCH] [CIR][X86] Add support for cmp builtins

---
 .../CIR/Dialect/Builder/CIRBaseBuilder.h      |   4 +
 clang/include/clang/CIR/Dialect/IR/CIROps.td  |   8 +
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    |  87 ++-
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       |   8 +
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |  33 +-
 .../CIR/CodeGenBuiltins/X86/cmp-builtins.c    | 719 ++++++++++++++++++
 6 files changed, 853 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/cmp-builtins.c

diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 8edb796884b5c..fe3bc846ddbc9 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -479,6 +479,10 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return createAddrSpaceCast(src.getLoc(), src, newTy);
   }
 
+  mlir::Value createVectorToIntCast(mlir::Location loc, mlir::Value src, mlir::Type newTy) {
+    return createCast(loc, cir::CastKind::vector_to_int, src, newTy);
+  }
+
   //===--------------------------------------------------------------------===//
   // Binary Operators
   //===--------------------------------------------------------------------===//
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 74e0860762ec6..c2be3359e3af2 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -177,6 +177,7 @@ def CIR_CastKind : CIR_I32EnumAttr<"CastKind", "cast kind", [
 
   // Enums below are specific to CIR and don't have a correspondence to classic
   // codegen:
+  I32EnumAttrCase<"vector_to_int", 999>,
   I32EnumAttrCase<"bool_to_float", 1000>,
 ]>;
 
@@ -217,6 +218,13 @@ def CIR_CastOp : CIR_Op<"cast", [
     CIR also supports some additional conversions that are not part of the classic
     Clang codegen:
 
+    - `vector_to_int`
+
+    Example:
+    ```mlir
+    %4 = cir.cast vector_to_int %3 : !cir.vector<16 x !s8i> -> !u16i
+    ```
+
     - `bool_to_float`
 
     Example:
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 75bf25b20f1af..70a206fb4b0e7 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -417,6 +417,81 @@ static mlir::Value emitX86vpcom(CIRGenBuilderTy &builder, mlir::Location loc,
   return builder.createVecCompare(loc, pred, op0, op1);
 }
 
+/// Emit an X86 masked integer compare builtin. \p ops holds {lhs, rhs,
+/// predicate immediate[, input mask]}. The low three bits of the immediate
+/// select the predicate (3 yields all-false, 7 all-true). The per-lane result
+/// is optionally ANDed with the input mask, padded to at least 8 lanes, and
+/// turned into an integer through a `vector_to_int` cast.
+/// FIXME: \p isSigned is never read, so callers cannot request unsigned
+/// ordering for the ucmp builtins through it.
+static mlir::Value emitX86MaskedCompare(CIRGenBuilderTy &builder,
+                                        mlir::Location loc,
+                                        llvm::ArrayRef<mlir::Value> ops,
+                                        bool isSigned = true) {
+  uint64_t imm = CIRGenFunction::getZExtIntValueFromConstOp(ops[2]) & 0x7;
+  auto ty = cast<cir::VectorType>(ops[0].getType());
+  auto elementTy = cast<cir::IntType>(ty.getElementType());
+  unsigned numElts = ty.getSize();
+  mlir::Value cmp;
+  if (imm == 3) {
+    // Always false: a zero vector of 1-bit lanes.
+    cmp = builder.getNullValue(
+        cir::VectorType::get(builder.getSIntNTy(1), numElts), loc);
+  } else if (imm == 7) {
+    // Always true. NOTE: splat has the operand element type, not 1-bit lanes.
+    llvm::APInt allOnes = llvm::APInt::getAllOnes(elementTy.getWidth());
+    cmp = cir::VecSplatOp::create(
+        builder, loc, ty, builder.getConstAPInt(loc, elementTy, allOnes));
+  } else {
+    cir::CmpOpKind pred;
+    switch (imm) {
+    default:
+      llvm_unreachable("Unknown condition code");
+    case 0:
+      pred = cir::CmpOpKind::eq;
+      break;
+    case 1:
+      pred = cir::CmpOpKind::lt;
+      break;
+    case 2:
+      pred = cir::CmpOpKind::le;
+      break;
+    case 4:
+      pred = cir::CmpOpKind::ne;
+      break;
+    case 5:
+      pred = cir::CmpOpKind::ge;
+      break;
+    case 6:
+      pred = cir::CmpOpKind::gt;
+      break;
+    }
+    auto boolVecTy = cir::VectorType::get(builder.getUIntNTy(1), numElts);
+    cmp = cir::VecCmpOp::create(builder, loc, boolVecTy, pred, ops[0], ops[1]);
+  }
+
+  // NOTE(review): the mask is skipped whenever a cir.cast defines it;
+  // presumably only the constant all-ones mask should be skipped - confirm.
+  mlir::Value maskIn = ops.size() == 4 ? ops[3] : mlir::Value();
+  if (maskIn && !maskIn.getDefiningOp<cir::CastOp>()) {
+    auto maskVec = getMaskVecValue(builder, loc, maskIn, numElts);
+    cmp = builder.createAnd(loc, cmp, maskVec);
+  }
+  if (numElts < 8) {
+    // Pad to 8 lanes; the extra indices select lanes of the zero vector.
+    mlir::Type i32Ty = builder.getSInt32Ty();
+    llvm::SmallVector<mlir::Attribute, 8> indices;
+    for (unsigned i = 0; i != numElts; ++i)
+      indices.push_back(cir::IntAttr::get(i32Ty, i));
+    for (unsigned i = numElts; i != 8; ++i)
+      indices.push_back(cir::IntAttr::get(i32Ty, i % numElts + numElts));
+    cmp = builder.createVecShuffle(
+        loc, cmp, builder.getNullValue(cmp.getType(), loc), indices);
+  }
+  return builder.createVectorToIntCast(
+      loc, cmp, builder.getUIntNTy(std::max(numElts, 8U)));
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -1338,6 +1413,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_selectsbf_128:
   case X86::BI__builtin_ia32_selectss_128:
   case X86::BI__builtin_ia32_selectsd_128:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_cmpb128_mask:
   case X86::BI__builtin_ia32_cmpb256_mask:
   case X86::BI__builtin_ia32_cmpb512_mask:
@@ -1361,11 +1440,9 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_ucmpd512_mask:
   case X86::BI__builtin_ia32_ucmpq128_mask:
   case X86::BI__builtin_ia32_ucmpq256_mask:
-  case X86::BI__builtin_ia32_ucmpq512_mask:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented X86 builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinID));
-    return {};
+  case X86::BI__builtin_ia32_ucmpq512_mask: {
+    return emitX86MaskedCompare(builder, getLoc(expr->getExprLoc()), ops);
+  }
   case X86::BI__builtin_ia32_vpcomb:
   case X86::BI__builtin_ia32_vpcomw:
   case X86::BI__builtin_ia32_vpcomd:
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index d888fdcf081e7..f6c54ee210d91 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -656,6 +656,14 @@ LogicalResult cir::CastOp::verify() {
              << "requires floating point !cir.complex type for result";
     return success();
   }
+  case cir::CastKind::vector_to_int: { // integer vector -> integer scalar
+    auto vectorTy = mlir::dyn_cast<cir::VectorType>(srcType);
+    if (!vectorTy || !mlir::isa<cir::IntType>(vectorTy.getElementType()))
+      return emitOpError() << "requires integer !cir.vector type for source";
+    if (!mlir::isa<cir::IntType>(resType))
+      return emitOpError() << "requires !cir.int type for result";
+    return success();
+  }
   default:
     llvm_unreachable("Unknown CastOp kind?");
   }
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index eeb886445ede4..6d550228af27d 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1334,6 +1334,29 @@ mlir::LogicalResult CIRToLLVMCastOpLowering::matchAndRewrite(
     assert(!MissingFeatures::cxxABI());
     assert(!MissingFeatures::methodType());
     break;
+  case cir::CastKind::vector_to_int: {
+    // Reinterpret the whole source vector as a single integer spanning all of
+    // its bits, then narrow that integer to the destination width.
+    auto srcVecTy = mlir::cast<cir::VectorType>(castOp.getSrc().getType());
+    auto srcEltTy = mlir::cast<cir::IntType>(srcVecTy.getElementType());
+    auto dstIntTy = mlir::cast<cir::IntType>(castOp.getType());
+    uint64_t srcWidth = srcVecTy.getSize() * srcEltTy.getWidth();
+    uint64_t dstWidth = dstIntTy.getWidth();
+    if (srcWidth < dstWidth)
+      return castOp.emitError("vector_to_int result is wider than its source");
+
+    mlir::Location loc = castOp.getLoc();
+    mlir::Value result = mlir::LLVM::BitcastOp::create(
+        rewriter, loc, rewriter.getIntegerType(srcWidth), adaptor.getSrc());
+
+    // LLVM's trunc needs a strictly narrower result type, so it is only
+    // emitted when the widths differ.
+    if (dstWidth < srcWidth)
+      result = mlir::LLVM::TruncOp::create(
+          rewriter, loc, rewriter.getIntegerType(dstWidth), result);
+    rewriter.replaceOp(castOp, result);
+    break;
+  }
   default: {
     return castOp.emitError("Unhandled cast kind: ")
            << castOp.getKindAttrName();
@@ -2650,7 +2673,6 @@ mlir::LogicalResult CIRToLLVMCmpOpLowering::matchAndRewrite(
       return mlir::success();
     }
   }
-
   return cmpOp.emitError() << "unsupported type for CmpOp: " << type;
 }
 
@@ -3590,6 +3612,15 @@ mlir::LogicalResult CIRToLLVMVecCmpOpLowering::matchAndRewrite(
 
   // LLVM IR vector comparison returns a vector of i1. This one-bit vector
   // must be sign-extended to the correct result type.
+  // Exception: when the CIR result is a vector of 1-bit integers, it has the
+  // same shape as the i1 vector LLVM produces, so the comparison result is
+  // used directly and no extension is emitted.
+  auto resEltTy =
+      mlir::dyn_cast<cir::IntType>(elementTypeIfVector(op.getType()));
+  if (resEltTy && resEltTy.getWidth() == 1) {
+    rewriter.replaceOp(op, bitResult);
+    return mlir::success();
+  }
   rewriter.replaceOpWithNewOp<mlir::LLVM::SExtOp>(
       op, typeConverter->convertType(op.getType()), bitResult);
   return mlir::success();
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/cmp-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/cmp-builtins.c
new file mode 100644
index 0000000000000..f37be51c43216
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/cmp-builtins.c
@@ -0,0 +1,719 @@
+// RUN: %clang_cc1 -x c -ffreestanding -triple x86_64-unknown-linux -target-feature +avx512vl -target-feature +avx512bw -Wno-implicit-function-declaration -fclangir -emit-cir -o %t.cir %s
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -ffreestanding -triple x86_64-unknown-linux -target-feature +avx512vl -target-feature +avx512bw -Wno-implicit-function-declaration -fclangir -emit-cir -o %t.cir %s
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -ffreestanding -triple x86_64-unknown-linux -target-feature +avx512vl -target-feature +avx512bw -Wno-implicit-function-declaration -fclangir -emit-llvm -o %t.ll %s
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c++ -ffreestanding -triple x86_64-unknown-linux -target-feature +avx512vl -target-feature +avx512bw -Wno-implicit-function-declaration -fclangir -emit-llvm -o %t.ll %s
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -ffreestanding -triple=x86_64-unknown-linux -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -Wall -Werror %s -o - | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -ffreestanding -triple=x86_64-unknown-linux -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -Wall -Werror %s -o - | FileCheck %s -check-prefix=OGCG
+
+// RUN: %clang_cc1 -x c -ffreestanding -triple=x86_64-unknown-linux -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -Wall -Werror %s -o - | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -ffreestanding -triple=x86_64-unknown-linux -target-feature +avx512vl -target-feature +avx512bw  -emit-llvm -Wall -Werror %s -o - | FileCheck %s -check-prefix=OGCG
+
+#include <immintrin.h>
+
+__mmask16 test_mm_cmp_epi8_mask(__m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_cmp_epi8_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+  // LLVM-LABEL: test_mm_cmp_epi8_mask
+  // LLVM: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm_cmp_epi8_mask
+  // OGCG: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmp_epi8_mask(__a, __b, 0);
+}
+
+__mmask16 test_mm_cmp_epi8_mask_imm3(__m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_cmp_epi8_mask_imm3
+  // CIR: cir.const #cir.zero : !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !cir.int<s, 1>> -> !u16i
+  // LLVM-LABEL: test_mm_cmp_epi8_mask_imm3
+  // LLVM: store i16 0, ptr %{{.*}}, align 2
+  // LLVM: load i16, ptr %{{.*}}, align 2
+  // LLVM: ret i16 %{{.*}}
+  // OGCG-LABEL: test_mm_cmp_epi8_mask_imm3
+  // OGCG: ret i16 0
+  return (__mmask16)_mm_cmp_epi8_mask(__a, __b, 3);
+}
+
+__mmask16 test_mm_cmp_epi8_mask_imm7(__m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_cmp_epi8_mask_imm7
+  // CIR: cir.vec.splat {{%.*}} : !s8i, !cir.vector<16 x !s8i>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !s8i> -> !u16i
+  // LLVM-LABEL: test_mm_cmp_epi8_mask_imm7
+  // LLVM: store i16 -1, ptr %{{.*}}, align 2
+  // LLVM: load i16, ptr %{{.*}}, align 2
+  // LLVM: ret i16 %{{.*}}
+  // OGCG-LABEL: test_mm_cmp_epi8_mask_imm7
+  // OGCG: ret i16 -1
+  return (__mmask16)_mm_cmp_epi8_mask(__a, __b, 7);
+}
+
+__mmask16 test_mm_mask_cmp_epi8_mask(__mmask16 __m, __m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_mask_cmp_epi8_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+  // LLVM-LABEL: test_mm_mask_cmp_epi8_mask
+  // LLVM: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // LLVM: and <16 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm_mask_cmp_epi8_mask
+  // OGCG: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // OGCG: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmp_epi8_mask(__m, __a, __b, 0);
+}
+
+__mmask32 test_mm256_cmp_epi8_mask(__m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_cmp_epi8_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<32 x !s8i>, !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+  // LLVM-LABEL: test_mm256_cmp_epi8_mask
+  // LLVM: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm256_cmp_epi8_mask
+  // OGCG: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmp_epi8_mask(__a, __b, 0);
+}
+
+__mmask32 test_mm256_mask_cmp_epi8_mask(__mmask32 __m, __m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_mask_cmp_epi8_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<32 x !s8i>, !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+  // LLVM-LABEL: test_mm256_mask_cmp_epi8_mask
+  // LLVM: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // LLVM: and <32 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm256_mask_cmp_epi8_mask
+  // OGCG: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // OGCG: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmp_epi8_mask(__m, __a, __b, 0);
+}
+
+__mmask64 test_mm512_cmp_epi8_mask(__m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_cmp_epi8_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<64 x !s8i>, !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<64 x !cir.int<u, 1>> -> !u64i
+  // LLVM-LABEL: test_mm512_cmp_epi8_mask
+  // LLVM: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_cmp_epi8_mask
+  // OGCG: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  return (__mmask64)_mm512_cmp_epi8_mask(__a, __b, 0);
+}
+
+__mmask64 test_mm512_mask_cmp_epi8_mask(__mmask64 __m, __m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_mask_cmp_epi8_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<64 x !s8i>, !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<64 x !cir.int<u, 1>> -> !u64i
+  // LLVM-LABEL: test_mm512_mask_cmp_epi8_mask
+  // LLVM: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // LLVM: and <64 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_mask_cmp_epi8_mask
+  // OGCG: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // OGCG: and <64 x i1> %{{.*}}, %{{.*}}
+  return (__mmask64)_mm512_mask_cmp_epi8_mask(__m, __a, __b, 0);
+}
+
+__mmask8 test_mm_cmp_epi16_mask(__m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_cmp_epi16_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm_cmp_epi16_mask
+  // LLVM: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm_cmp_epi16_mask
+  // OGCG: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epi16_mask(__a, __b, 0);
+}
+
+__mmask8 test_mm_mask_cmp_epi16_mask(__mmask8 __m, __m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_mask_cmp_epi16_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm_mask_cmp_epi16_mask
+  // LLVM: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // LLVM: and <8 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm_mask_cmp_epi16_mask
+  // OGCG: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // OGCG: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epi16_mask(__m, __a, __b, 0);
+}
+
+__mmask16 test_mm256_cmp_epi16_mask(__m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_cmp_epi16_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<16 x !s16i>, !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+  // LLVM-LABEL: test_mm256_cmp_epi16_mask
+  // LLVM: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm256_cmp_epi16_mask
+  // OGCG: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmp_epi16_mask(__a, __b, 0);
+}
+
+__mmask16 test_mm256_mask_cmp_epi16_mask(__mmask16 __m, __m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_mask_cmp_epi16_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<16 x !s16i>, !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+  // LLVM-LABEL: test_mm256_mask_cmp_epi16_mask
+  // LLVM: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // LLVM: and <16 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm256_mask_cmp_epi16_mask
+  // OGCG: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // OGCG: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmp_epi16_mask(__m, __a, __b, 0);
+}
+
+__mmask32 test_mm512_cmp_epi16_mask(__m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_cmp_epi16_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<32 x !s16i>, !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+  // LLVM-LABEL: test_mm512_cmp_epi16_mask
+  // LLVM: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_cmp_epi16_mask
+  // OGCG: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm512_cmp_epi16_mask(__a, __b, 0);
+}
+
+__mmask32 test_mm512_mask_cmp_epi16_mask(__mmask32 __m, __m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_mask_cmp_epi16_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<32 x !s16i>, !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+  // LLVM-LABEL: test_mm512_mask_cmp_epi16_mask
+  // LLVM: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // LLVM: and <32 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_mask_cmp_epi16_mask
+  // OGCG: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // OGCG: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm512_mask_cmp_epi16_mask(__m, __a, __b, 0);
+}
+
+__mmask8 test_mm_cmp_epi32_mask(__m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_cmp_epi32_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<4 x !s32i>, !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.const #cir.zero : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<4 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm_cmp_epi32_mask
+  // LLVM: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // LLVM: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // OGCG-LABEL: test_mm_cmp_epi32_mask
+  // OGCG: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // OGCG: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return (__mmask8)_mm_cmp_epi32_mask(__a, __b, 0);
+}
+
+__mmask8 test_mm_mask_cmp_epi32_mask(__mmask8 __m, __m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_mask_cmp_epi32_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<4 x !s32i>, !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<8 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.const #cir.zero : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<4 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm_mask_cmp_epi32_mask
+  // LLVM: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // LLVM: bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM: and <4 x i1> %{{.*}}, %{{.*}}
+  // LLVM: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // OGCG-LABEL: test_mm_mask_cmp_epi32_mask
+  // OGCG: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // OGCG: and <4 x i1> %{{.*}}, %{{.*}}
+  // OGCG: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return (__mmask8)_mm_mask_cmp_epi32_mask(__m, __a, __b, 0);
+}
+
+__mmask8 test_mm256_cmp_epi32_mask(__m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_cmp_epi32_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<8 x !s32i>, !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm256_cmp_epi32_mask
+  // LLVM: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm256_cmp_epi32_mask
+  // OGCG: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmp_epi32_mask(__a, __b, 0);
+}
+
+__mmask8 test_mm256_mask_cmp_epi32_mask(__mmask8 __m, __m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_mask_cmp_epi32_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<8 x !s32i>, !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm256_mask_cmp_epi32_mask
+  // LLVM: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  // LLVM: and <8 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm256_mask_cmp_epi32_mask
+  // OGCG: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  // OGCG: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmp_epi32_mask(__m, __a, __b, 0);
+}
+
+__mmask16 test_mm512_cmp_epi32_mask(__m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_cmp_epi32_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<16 x !s32i>, !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+  // LLVM-LABEL: test_mm512_cmp_epi32_mask
+  // LLVM: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_cmp_epi32_mask
+  // OGCG: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm512_cmp_epi32_mask(__a, __b, 0);
+}
+
+__mmask16 test_mm512_mask_cmp_epi32_mask(__mmask16 __m, __m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_mask_cmp_epi32_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<16 x !s32i>, !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+  // LLVM-LABEL: test_mm512_mask_cmp_epi32_mask
+  // LLVM: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // LLVM: and <16 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_mask_cmp_epi32_mask
+  // OGCG: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // OGCG: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm512_mask_cmp_epi32_mask(__m, __a, __b, 0);
+}
+
+__mmask8 test_mm_cmp_epi64_mask(__m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_cmp_epi64_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<2 x !s64i>, !cir.vector<2 x !cir.int<u, 1>>
+  // CIR: cir.const #cir.zero : !cir.vector<2 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<2 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm_cmp_epi64_mask
+  // LLVM: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // LLVM: shufflevector <2 x i1> %{{.*}}, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  // OGCG-LABEL: test_mm_cmp_epi64_mask
+  // OGCG: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // OGCG: shufflevector <2 x i1> %{{.*}}, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  return (__mmask8)_mm_cmp_epi64_mask(__a, __b, 0);
+}
+
+__mmask8 test_mm_mask_cmp_epi64_mask(__mmask8 __m, __m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_mask_cmp_epi64_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<2 x !s64i>, !cir.vector<2 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<8 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<2 x !cir.int<u, 1>>
+  // CIR: cir.const #cir.zero : !cir.vector<2 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<2 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm_mask_cmp_epi64_mask
+  // LLVM: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // LLVM: bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // LLVM: and <2 x i1> %{{.*}}, %{{.*}}
+  // LLVM: shufflevector <2 x i1> %{{.*}}, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  // OGCG-LABEL: test_mm_mask_cmp_epi64_mask
+  // OGCG: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // OGCG: and <2 x i1> %{{.*}}, %{{.*}}
+  // OGCG: shufflevector <2 x i1> %{{.*}}, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  return (__mmask8)_mm_mask_cmp_epi64_mask(__m, __a, __b, 0);
+}
+
+__mmask8 test_mm256_cmp_epi64_mask(__m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_cmp_epi64_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<4 x !s64i>, !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.const #cir.zero : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<4 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm256_cmp_epi64_mask
+  // LLVM: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // LLVM: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // OGCG-LABEL: test_mm256_cmp_epi64_mask
+  // OGCG: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // OGCG: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return (__mmask8)_mm256_cmp_epi64_mask(__a, __b, 0);
+}
+
+__mmask8 test_mm256_mask_cmp_epi64_mask(__mmask8 __m, __m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_mask_cmp_epi64_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<4 x !s64i>, !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<8 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.const #cir.zero : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<4 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm256_mask_cmp_epi64_mask
+  // LLVM: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // LLVM: bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM: and <4 x i1> %{{.*}}, %{{.*}}
+  // LLVM: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // OGCG-LABEL: test_mm256_mask_cmp_epi64_mask
+  // OGCG: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // OGCG: and <4 x i1> %{{.*}}, %{{.*}}
+  // OGCG: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return (__mmask8)_mm256_mask_cmp_epi64_mask(__m, __a, __b, 0);
+}
+
+__mmask16 test_mm_cmp_epu8_mask(__m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_cmp_epu8_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+  // LLVM-LABEL: test_mm_cmp_epu8_mask
+  // LLVM: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm_cmp_epu8_mask
+  // OGCG: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmp_epu8_mask(__a, __b, 0);
+}
+
+__mmask16 test_mm_mask_cmp_epu8_mask(__mmask16 __m, __m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_mask_cmp_epu8_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+  // LLVM-LABEL: test_mm_mask_cmp_epu8_mask
+  // LLVM: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // LLVM: and <16 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm_mask_cmp_epu8_mask
+  // OGCG: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // OGCG: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmp_epu8_mask(__m, __a, __b, 0);
+}
+
+__mmask32 test_mm256_cmp_epu8_mask(__m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_cmp_epu8_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<32 x !s8i>, !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+  // LLVM-LABEL: test_mm256_cmp_epu8_mask
+  // LLVM: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm256_cmp_epu8_mask
+  // OGCG: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmp_epu8_mask(__a, __b, 0);
+}
+
+__mmask32 test_mm256_mask_cmp_epu8_mask(__mmask32 __m, __m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_mask_cmp_epu8_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<32 x !s8i>, !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+  // LLVM-LABEL: test_mm256_mask_cmp_epu8_mask
+  // LLVM: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // LLVM: and <32 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm256_mask_cmp_epu8_mask
+  // OGCG: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // OGCG: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmp_epu8_mask(__m, __a, __b, 0);
+}
+
+__mmask64 test_mm512_cmp_epu8_mask(__m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_cmp_epu8_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<64 x !s8i>, !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<64 x !cir.int<u, 1>> -> !u64i
+  // LLVM-LABEL: test_mm512_cmp_epu8_mask
+  // LLVM: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_cmp_epu8_mask
+  // OGCG: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  return (__mmask64)_mm512_cmp_epu8_mask(__a, __b, 0);
+}
+
+__mmask64 test_mm512_mask_cmp_epu8_mask(__mmask64 __m, __m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_mask_cmp_epu8_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<64 x !s8i>, !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<64 x !cir.int<u, 1>> -> !u64i
+  // LLVM-LABEL: test_mm512_mask_cmp_epu8_mask
+  // LLVM: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // LLVM: and <64 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_mask_cmp_epu8_mask
+  // OGCG: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // OGCG: and <64 x i1> %{{.*}}, %{{.*}}
+  return (__mmask64)_mm512_mask_cmp_epu8_mask(__m, __a, __b, 0);
+}
+
+__mmask8 test_mm_cmp_epu16_mask(__m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_cmp_epu16_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm_cmp_epu16_mask
+  // LLVM: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm_cmp_epu16_mask
+  // OGCG: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epu16_mask(__a, __b, 0);
+}
+
+__mmask8 test_mm_mask_cmp_epu16_mask(__mmask8 __m, __m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_mask_cmp_epu16_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm_mask_cmp_epu16_mask
+  // LLVM: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // LLVM: and <8 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm_mask_cmp_epu16_mask
+  // OGCG: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // OGCG: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epu16_mask(__m, __a, __b, 0);
+}
+
+__mmask16 test_mm256_cmp_epu16_mask(__m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_cmp_epu16_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<16 x !s16i>, !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+  // LLVM-LABEL: test_mm256_cmp_epu16_mask
+  // LLVM: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm256_cmp_epu16_mask
+  // OGCG: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmp_epu16_mask(__a, __b, 0);
+}
+
+__mmask16 test_mm256_mask_cmp_epu16_mask(__mmask16 __m, __m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_mask_cmp_epu16_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<16 x !s16i>, !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+  // LLVM-LABEL: test_mm256_mask_cmp_epu16_mask
+  // LLVM: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // LLVM: and <16 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm256_mask_cmp_epu16_mask
+  // OGCG: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // OGCG: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmp_epu16_mask(__m, __a, __b, 0);
+}
+
+__mmask32 test_mm512_cmp_epu16_mask(__m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_cmp_epu16_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<32 x !s16i>, !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+  // LLVM-LABEL: test_mm512_cmp_epu16_mask
+  // LLVM: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_cmp_epu16_mask
+  // OGCG: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm512_cmp_epu16_mask(__a, __b, 0);
+}
+
+__mmask32 test_mm512_mask_cmp_epu16_mask(__mmask32 __m, __m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_mask_cmp_epu16_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<32 x !s16i>, !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+  // LLVM-LABEL: test_mm512_mask_cmp_epu16_mask
+  // LLVM: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // LLVM: and <32 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_mask_cmp_epu16_mask
+  // OGCG: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // OGCG: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm512_mask_cmp_epu16_mask(__m, __a, __b, 0);
+}
+
+__mmask8 test_mm_cmp_epu32_mask(__m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_cmp_epu32_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<4 x !s32i>, !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.const #cir.zero : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<4 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm_cmp_epu32_mask
+  // LLVM: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // LLVM: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // OGCG-LABEL: test_mm_cmp_epu32_mask
+  // OGCG: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // OGCG: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return (__mmask8)_mm_cmp_epu32_mask(__a, __b, 0);
+}
+
+__mmask8 test_mm_mask_cmp_epu32_mask(__mmask8 __m, __m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_mask_cmp_epu32_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<4 x !s32i>, !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<8 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.const #cir.zero : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<4 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm_mask_cmp_epu32_mask
+  // LLVM: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // LLVM: bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM: and <4 x i1> %{{.*}}, %{{.*}}
+  // LLVM: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // OGCG-LABEL: test_mm_mask_cmp_epu32_mask
+  // OGCG: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // OGCG: and <4 x i1> %{{.*}}, %{{.*}}
+  // OGCG: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return (__mmask8)_mm_mask_cmp_epu32_mask(__m, __a, __b, 0);
+}
+
+__mmask8 test_mm256_cmp_epu32_mask(__m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_cmp_epu32_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<8 x !s32i>, !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm256_cmp_epu32_mask
+  // LLVM: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm256_cmp_epu32_mask
+  // OGCG: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmp_epu32_mask(__a, __b, 0);
+}
+
+__mmask8 test_mm256_mask_cmp_epu32_mask(__mmask8 __m, __m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_mask_cmp_epu32_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<8 x !s32i>, !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm256_mask_cmp_epu32_mask
+  // LLVM: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  // LLVM: and <8 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm256_mask_cmp_epu32_mask
+  // OGCG: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  // OGCG: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmp_epu32_mask(__m, __a, __b, 0);
+}
+
+__mmask16 test_mm512_cmp_epu32_mask(__m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_cmp_epu32_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<16 x !s32i>, !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+  // LLVM-LABEL: test_mm512_cmp_epu32_mask
+  // LLVM: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_cmp_epu32_mask
+  // OGCG: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm512_cmp_epu32_mask(__a, __b, 0);
+}
+
+__mmask16 test_mm512_mask_cmp_epu32_mask(__mmask16 __m, __m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_mask_cmp_epu32_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<16 x !s32i>, !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+  // LLVM-LABEL: test_mm512_mask_cmp_epu32_mask
+  // LLVM: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // LLVM: and <16 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_mask_cmp_epu32_mask
+  // OGCG: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // OGCG: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm512_mask_cmp_epu32_mask(__m, __a, __b, 0);
+}
+
+__mmask8 test_mm_cmp_epu64_mask(__m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_cmp_epu64_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<2 x !s64i>, !cir.vector<2 x !cir.int<u, 1>>
+  // CIR: cir.const #cir.zero : !cir.vector<2 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<2 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm_cmp_epu64_mask
+  // LLVM: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // LLVM: shufflevector <2 x i1> %{{.*}}, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  // OGCG-LABEL: test_mm_cmp_epu64_mask
+  // OGCG: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // OGCG: shufflevector <2 x i1> %{{.*}}, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  return (__mmask8)_mm_cmp_epu64_mask(__a, __b, 0);
+}
+
+__mmask8 test_mm_mask_cmp_epu64_mask(__mmask8 __m, __m128i __a, __m128i __b) {
+  // CIR-LABEL: test_mm_mask_cmp_epu64_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<2 x !s64i>, !cir.vector<2 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<8 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<2 x !cir.int<u, 1>>
+  // CIR: cir.const #cir.zero : !cir.vector<2 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<2 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm_mask_cmp_epu64_mask
+  // LLVM: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // LLVM: bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // LLVM: and <2 x i1> %{{.*}}, %{{.*}}
+  // LLVM: shufflevector <2 x i1> %{{.*}}, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  // OGCG-LABEL: test_mm_mask_cmp_epu64_mask
+  // OGCG: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // OGCG: and <2 x i1> %{{.*}}, %{{.*}}
+  // OGCG: shufflevector <2 x i1> %{{.*}}, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  return (__mmask8)_mm_mask_cmp_epu64_mask(__m, __a, __b, 0);
+}
+
+__mmask8 test_mm256_cmp_epu64_mask(__m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_cmp_epu64_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<4 x !s64i>, !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.const #cir.zero : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<4 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm256_cmp_epu64_mask
+  // LLVM: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // LLVM: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // OGCG-LABEL: test_mm256_cmp_epu64_mask
+  // OGCG: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // OGCG: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return (__mmask8)_mm256_cmp_epu64_mask(__a, __b, 0);
+}
+
+__mmask8 test_mm256_mask_cmp_epu64_mask(__mmask8 __m, __m256i __a, __m256i __b) {
+  // CIR-LABEL: test_mm256_mask_cmp_epu64_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<4 x !s64i>, !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<8 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.const #cir.zero : !cir.vector<4 x !cir.int<u, 1>>
+  // CIR: cir.vec.shuffle({{%.*}}, {{%.*}} : !cir.vector<4 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm256_mask_cmp_epu64_mask
+  // LLVM: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // LLVM: bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM: and <4 x i1> %{{.*}}, %{{.*}}
+  // LLVM: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // OGCG-LABEL: test_mm256_mask_cmp_epu64_mask
+  // OGCG: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // OGCG: and <4 x i1> %{{.*}}, %{{.*}}
+  // OGCG: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return (__mmask8)_mm256_mask_cmp_epu64_mask(__m, __a, __b, 0);
+}
+
+__mmask8 test_mm512_cmp_epu64_mask(__m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_cmp_epu64_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<8 x !s64i>, !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm512_cmp_epu64_mask
+  // LLVM: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_cmp_epu64_mask
+  // OGCG: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm512_cmp_epu64_mask(__a, __b, 0);
+}
+
+__mmask8 test_mm512_mask_cmp_epu64_mask(__mmask8 __m, __m512i __a, __m512i __b) {
+  // CIR-LABEL: test_mm512_mask_cmp_epu64_mask
+  // CIR: cir.vec.cmp(eq, {{%.*}}, {{%.*}}) : !cir.vector<8 x !s64i>, !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{%.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{%.*}}, {{%.*}}) : !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: cir.cast vector_to_int {{%.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+  // LLVM-LABEL: test_mm512_mask_cmp_epu64_mask
+  // LLVM: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  // LLVM: and <8 x i1> %{{.*}}, %{{.*}}
+  // OGCG-LABEL: test_mm512_mask_cmp_epu64_mask
+  // OGCG: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  // OGCG: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm512_mask_cmp_epu64_mask(__m, __a, __b, 0);
+}

>From 33f03922e48375b31a5ed5bc017fc6d9bf8ff2a5 Mon Sep 17 00:00:00 2001
From: Zhihui Yang <youngwisdm at gmail.com>
Date: Mon, 5 Jan 2026 05:53:29 -0800
Subject: [PATCH 2/3] [task] fix the commit suggestion

---
 .../CIR/Dialect/Builder/CIRBaseBuilder.h      |   3 +-
 clang/include/clang/CIR/Dialect/IR/CIROps.td  | 147 +++++++++---------
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    |  89 +++++------
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       |   3 +-
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |  25 ++-
 5 files changed, 131 insertions(+), 136 deletions(-)

diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index fe3bc846ddbc9..8001648efc205 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -479,7 +479,8 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return createAddrSpaceCast(src.getLoc(), src, newTy);
   }
 
-  mlir::Value createVectorToIntCast(mlir::Location loc, mlir::Value src, mlir::Type newTy) {
+  mlir::Value createVectorToIntCast(mlir::Location loc, mlir::Value src,
+                                    mlir::Type newTy) {
     return createCast(loc, cir::CastKind::vector_to_int, src, newTy);
   }
 
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index c2be3359e3af2..2dd0dc3416b57 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -105,80 +105,79 @@ class HasAtMostOneOfAttrs<list<string> names> : PredOpTrait<
 // CastOp
 //===----------------------------------------------------------------------===//
 
-def CIR_CastKind : CIR_I32EnumAttr<"CastKind", "cast kind", [
-  I32EnumAttrCase<"bitcast", 1>,
-  // CK_LValueBitCast
-  // CK_LValueToRValueBitCast
-  // CK_LValueToRValue
-  // CK_NoOp
-  // CK_BaseToDerived
-  // CK_DerivedToBase
-  // CK_UncheckedDerivedToBase
-  // CK_Dynamic
-  // CK_ToUnion
-  I32EnumAttrCase<"array_to_ptrdecay", 11>,
-  // CK_FunctionToPointerDecay
-  // CK_NullToPointer
-  // CK_NullToMemberPointer
-  // CK_BaseToDerivedMemberPointer
-  // CK_DerivedToBaseMemberPointer
-  I32EnumAttrCase<"member_ptr_to_bool", 17>,
-  // CK_ReinterpretMemberPointer
-  // CK_UserDefinedConversion
-  // CK_ConstructorConversion
-  I32EnumAttrCase<"int_to_ptr", 21>,
-  I32EnumAttrCase<"ptr_to_int", 22>,
-  I32EnumAttrCase<"ptr_to_bool", 23>,
-  // CK_ToVoid
-  // CK_MatrixCast
-  // CK_VectorSplat
-  I32EnumAttrCase<"integral", 27>,
-  I32EnumAttrCase<"int_to_bool", 28>,
-  I32EnumAttrCase<"int_to_float", 29>,
-  // CK_FloatingToFixedPoint
-  // CK_FixedPointToFloating
-  // CK_FixedPointCast
-  // CK_FixedPointToIntegral
-  // CK_IntegralToFixedPoint
-  // CK_FixedPointToBoolean
-  I32EnumAttrCase<"float_to_int", 36>,
-  I32EnumAttrCase<"float_to_bool", 37>,
-  I32EnumAttrCase<"bool_to_int", 38>,
-  I32EnumAttrCase<"floating", 39>,
-  // CK_CPointerToObjCPointerCast
-  // CK_BlockPointerToObjCPointerCast
-  // CK_AnyPointerToBlockPointerCast
-  // CK_ObjCObjectLValueCast
-  I32EnumAttrCase<"float_to_complex", 44>,
-  I32EnumAttrCase<"float_complex_to_real", 45>,
-  I32EnumAttrCase<"float_complex_to_bool", 46>,
-  I32EnumAttrCase<"float_complex", 47>,
-  I32EnumAttrCase<"float_complex_to_int_complex", 48>,
-  I32EnumAttrCase<"int_to_complex", 49>,
-  I32EnumAttrCase<"int_complex_to_real", 50>,
-  I32EnumAttrCase<"int_complex_to_bool", 51>,
-  I32EnumAttrCase<"int_complex", 52>,
-  I32EnumAttrCase<"int_complex_to_float_complex", 53>,
-  // CK_ARCProduceObject
-  // CK_ARCConsumeObject
-  // CK_ARCReclaimReturnedObject
-  // CK_ARCExtendBlockObject
-  // CK_AtomicToNonAtomic
-  // CK_NonAtomicToAtomic
-  // CK_CopyAndAutoreleaseBlockObject
-  // CK_BuiltinFnToFnPtr
-  // CK_ZeroToOCLOpaqueType
-  I32EnumAttrCase<"address_space", 63>,
-  // CK_IntToOCLSampler
-  // CK_HLSLVectorTruncation
-  // CK_HLSLArrayRValue
-  // CK_HLSLElementwiseCast
-  // CK_HLSLAggregateSplatCast
-
-  // Enums below are specific to CIR and don't have a correspondence to classic
-  // codegen:
-  I32EnumAttrCase<"vector_to_int", 999>,
-  I32EnumAttrCase<"bool_to_float", 1000>,
+def CIR_CastKind
+    : CIR_I32EnumAttr<
+          "CastKind", "cast kind",
+          [I32EnumAttrCase<"bitcast", 1>,
+           // CK_LValueBitCast
+           // CK_LValueToRValueBitCast
+           // CK_LValueToRValue
+           // CK_NoOp
+           // CK_BaseToDerived
+           // CK_DerivedToBase
+           // CK_UncheckedDerivedToBase
+           // CK_Dynamic
+           // CK_ToUnion
+           I32EnumAttrCase<"array_to_ptrdecay", 11>,
+           // CK_FunctionToPointerDecay
+           // CK_NullToPointer
+           // CK_NullToMemberPointer
+           // CK_BaseToDerivedMemberPointer
+           // CK_DerivedToBaseMemberPointer
+           I32EnumAttrCase<"member_ptr_to_bool", 17>,
+           // CK_ReinterpretMemberPointer
+           // CK_UserDefinedConversion
+           // CK_ConstructorConversion
+           I32EnumAttrCase<"int_to_ptr", 21>, I32EnumAttrCase<"ptr_to_int", 22>,
+           I32EnumAttrCase<"ptr_to_bool", 23>,
+           // CK_ToVoid
+           // CK_MatrixCast
+           // CK_VectorSplat
+           I32EnumAttrCase<"integral", 27>, I32EnumAttrCase<"int_to_bool", 28>,
+           I32EnumAttrCase<"int_to_float", 29>,
+           // CK_FloatingToFixedPoint
+           // CK_FixedPointToFloating
+           // CK_FixedPointCast
+           // CK_FixedPointToIntegral
+           // CK_IntegralToFixedPoint
+           // CK_FixedPointToBoolean
+           I32EnumAttrCase<"float_to_int", 36>,
+           I32EnumAttrCase<"float_to_bool", 37>,
+           I32EnumAttrCase<"bool_to_int", 38>, I32EnumAttrCase<"floating", 39>,
+           // CK_CPointerToObjCPointerCast
+           // CK_BlockPointerToObjCPointerCast
+           // CK_AnyPointerToBlockPointerCast
+           // CK_ObjCObjectLValueCast
+           I32EnumAttrCase<"float_to_complex", 44>,
+           I32EnumAttrCase<"float_complex_to_real", 45>,
+           I32EnumAttrCase<"float_complex_to_bool", 46>,
+           I32EnumAttrCase<"float_complex", 47>,
+           I32EnumAttrCase<"float_complex_to_int_complex", 48>,
+           I32EnumAttrCase<"int_to_complex", 49>,
+           I32EnumAttrCase<"int_complex_to_real", 50>,
+           I32EnumAttrCase<"int_complex_to_bool", 51>,
+           I32EnumAttrCase<"int_complex", 52>,
+           I32EnumAttrCase<"int_complex_to_float_complex", 53>,
+           // CK_ARCProduceObject
+           // CK_ARCConsumeObject
+           // CK_ARCReclaimReturnedObject
+           // CK_ARCExtendBlockObject
+           // CK_AtomicToNonAtomic
+           // CK_NonAtomicToAtomic
+           // CK_CopyAndAutoreleaseBlockObject
+           // CK_BuiltinFnToFnPtr
+           // CK_ZeroToOCLOpaqueType
+           I32EnumAttrCase<"address_space", 63>,
+           // CK_IntToOCLSampler
+           // CK_HLSLVectorTruncation
+           // CK_HLSLArrayRValue
+           // CK_HLSLElementwiseCast
+           // CK_HLSLAggregateSplatCast
+
+           // Enums below are specific to CIR and don't have a correspondence to
+           // classic codegen:
+           I32EnumAttrCase<"vector_to_int", 999>,
+           I32EnumAttrCase<"bool_to_float", 1000>,
 ]>;
 
 def CIR_CastOp : CIR_Op<"cast", [
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 70a206fb4b0e7..65f9f1389c478 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -417,80 +417,77 @@ static mlir::Value emitX86vpcom(CIRGenBuilderTy &builder, mlir::Location loc,
   return builder.createVecCompare(loc, pred, op0, op1);
 }
 
-static mlir::Value emitX86MaskedCompare(CIRGenBuilderTy &builder, mlir::Location loc,
-                                        llvm::SmallVector<mlir::Value> ops, bool isSigned = true)
-{
+static mlir::Value emitX86MaskedCompare(CIRGenBuilderTy &builder,
+                                        mlir::Location loc,
+                                        llvm::SmallVector<mlir::Value> ops,
+                                        bool isSigned = true) {
 
   uint64_t imm = CIRGenFunction::getZExtIntValueFromConstOp(ops[2]) & 0x7;
   cir::VectorType ty = cast<cir::VectorType>(ops[0].getType());
   cir::IntType elementTy = cast<cir::IntType>(ty.getElementType());
   unsigned numElts = ty.getSize();
   mlir::Value cmp;
-  if (imm == 3)
-  {
-    cmp = builder.getNullValue(cir::VectorType::get(builder.getSIntNTy(1), numElts), loc);
-  }
-  else if (imm == 7)
-  {
+  if (imm == 3) {
+    cmp = builder.getNullValue(
+        cir::VectorType::get(builder.getSIntNTy(1), numElts), loc);
+  } else if (imm == 7) {
     llvm::APInt allOnes = llvm::APInt::getAllOnes(elementTy.getWidth());
     cmp = cir::VecSplatOp::create(
         builder, loc, ty, builder.getConstAPInt(loc, elementTy, allOnes));
-  }
-  else
-  {
+  } else {
     cir::CmpOpKind pred;
-    switch(imm) {
-      default:
-        llvm_unreachable("Unknown condition code");
-      case 0:
-        pred = cir::CmpOpKind::eq;
-        break;
-      case 1:
-        pred = cir::CmpOpKind::lt;
-        break;
-      case 2:
-        pred = cir::CmpOpKind::le;
-        break;
-      case 4:
-        pred = cir::CmpOpKind::ne;
-        break;
-      case 5:
-        pred = cir::CmpOpKind::ge;
-        break;
-      case 6:
-        pred = cir::CmpOpKind::gt;
-        break;
-    }
-      cir::VectorType integralVecTy = cir::VectorType::get(builder.getUIntNTy(1), numElts);
-      cmp = cir::VecCmpOp::create(builder, loc, integralVecTy, pred, ops[0], ops[1]);
+    switch (imm) {
+    default:
+      llvm_unreachable("Unknown condition code");
+    case 0:
+      pred = cir::CmpOpKind::eq;
+      break;
+    case 1:
+      pred = cir::CmpOpKind::lt;
+      break;
+    case 2:
+      pred = cir::CmpOpKind::le;
+      break;
+    case 4:
+      pred = cir::CmpOpKind::ne;
+      break;
+    case 5:
+      pred = cir::CmpOpKind::ge;
+      break;
+    case 6:
+      pred = cir::CmpOpKind::gt;
+      break;
     }
+    cir::VectorType integralVecTy =
+        cir::VectorType::get(builder.getUIntNTy(1), numElts);
+    cmp = cir::VecCmpOp::create(builder, loc, integralVecTy, pred, ops[0],
+                                ops[1]);
+  }
   mlir::Value maskIn = nullptr;
   if (ops.size() == 4)
     maskIn = ops[3];
 
-  if (maskIn)
-  {
+  if (maskIn) {
     auto castOp = mlir::dyn_cast_or_null<cir::CastOp>(maskIn.getDefiningOp());
-    if (!castOp)
-    {
-      auto maskVec = getMaskVecValue(builder, loc, maskIn, numElts);
+    if (!castOp) {
+      mlir::Value maskVec = getMaskVecValue(builder, loc, maskIn, numElts);
       cmp = builder.createAnd(loc, cmp, maskVec);
     }
   }
-  if (numElts < 8)
-  {
+  if (numElts < 8) {
     mlir::Type i32Ty = builder.getSInt32Ty();
     SmallVector<mlir::Attribute, 8> indices;
     for (unsigned i = 0; i != numElts; ++i)
       indices.push_back(cir::IntAttr::get(i32Ty, i));
     for (unsigned i = numElts; i != 8; ++i)
       indices.push_back(cir::IntAttr::get(i32Ty, i % numElts + numElts));
-    cmp = builder.createVecShuffle(loc, cmp, builder.getNullValue(cmp.getType(), loc), indices);
+    cmp = builder.createVecShuffle(
+        loc, cmp, builder.getNullValue(cmp.getType(), loc), indices);
   }
-  auto result = builder.createVectorToIntCast(
+  mlir::Value result = builder.createVectorToIntCast(
       loc, cmp, builder.getUIntNTy(std::max(numElts, 8U)));
   return result;
-  }
+}
 
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index f6c54ee210d91..5ee26238b8184 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -657,8 +657,7 @@ LogicalResult cir::CastOp::verify() {
     return success();
   }
   case cir::CastKind::vector_to_int: {
-    auto vectorTy = mlir::dyn_cast<cir::VectorType>(srcType);
-    if (!vectorTy)
+    if (!mlir::isa<cir::VectorType>(srcType))
       return emitOpError() << "requires !cir.vector type for source";
     if (!mlir::isa<cir::IntType>(resType))
       return emitOpError() << "requires !cir.int type for result";
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 6d550228af27d..62d8184c79568 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1339,18 +1339,19 @@ mlir::LogicalResult CIRToLLVMCastOpLowering::matchAndRewrite(
     mlir::Type dstType = castOp.getType();
     mlir::Value llvmSrcVal = adaptor.getSrc();
     cir::IntType srcIntType =
-    mlir::cast<cir::IntType>(elementTypeIfVector(srcType));
+        mlir::cast<cir::IntType>(elementTypeIfVector(srcType));
     cir::IntType dstIntType =
         mlir::cast<cir::IntType>(elementTypeIfVector(dstType));
-    uint64_t numElements =
-        mlir::cast<cir::VectorType>(srcType).getSize();
+    uint64_t numElements = mlir::cast<cir::VectorType>(srcType).getSize();
 
-    auto width = numElements * srcIntType.getWidth();
-    auto convertIntTypeOp = rewriter.create<mlir::LLVM::BitcastOp>(castOp.getLoc(), rewriter.getIntegerType(width),
-                                                       llvmSrcVal);
+    unsigned width = numElements * srcIntType.getWidth();
+    auto convertIntTypeOp = rewriter.create<mlir::LLVM::BitcastOp>(
+        castOp.getLoc(), rewriter.getIntegerType(width), llvmSrcVal);
     // truncate to the destination integer type
-    auto dstWidth = dstIntType.getWidth();
-    auto srcWidth = mlir::cast<mlir::IntegerType>(convertIntTypeOp.getResult().getType()).getWidth();
+    unsigned dstWidth = dstIntType.getWidth();
+    unsigned srcWidth =
+        mlir::cast<mlir::IntegerType>(convertIntTypeOp.getResult().getType())
+            .getWidth();
     auto truncOp = rewriter.create<mlir::LLVM::TruncOp>(
         convertIntTypeOp.getLoc(), rewriter.getIntegerType(dstWidth),
         convertIntTypeOp.getResult());
@@ -3612,11 +3613,9 @@ mlir::LogicalResult CIRToLLVMVecCmpOpLowering::matchAndRewrite(
 
   // LLVM IR vector comparison returns a vector of i1. This one-bit vector
   // must be sign-extended to the correct result type.
-  auto vecElementType = elementTypeIfVector(op.getType());
-  if (auto intType = mlir::dyn_cast<cir::IntType>(vecElementType))
-  {
-    if (intType.getWidth() == 1)
-    {
+  mlir::Type vecElementType = elementTypeIfVector(op.getType());
+  if (auto intType = mlir::dyn_cast<cir::IntType>(vecElementType)) {
+    if (intType.getWidth() == 1) {
       rewriter.replaceOp(op, bitResult);
       return mlir::success();
     }

>From bcd10885462de1b1405684c8161db0a7bde610f1 Mon Sep 17 00:00:00 2001
From: Zhihui Yang <youngwisdm at gmail.com>
Date: Thu, 8 Jan 2026 05:03:58 -0800
Subject: [PATCH 3/3] [task] fix the comments

---
 clang/include/clang/CIR/Dialect/IR/CIROps.td | 147 ++++++++++---------
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp   |  43 +++---
 2 files changed, 94 insertions(+), 96 deletions(-)

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 2dd0dc3416b57..c2be3359e3af2 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -105,79 +105,80 @@ class HasAtMostOneOfAttrs<list<string> names> : PredOpTrait<
 // CastOp
 //===----------------------------------------------------------------------===//
 
-def CIR_CastKind
-    : CIR_I32EnumAttr<
-          "CastKind", "cast kind",
-          [I32EnumAttrCase<"bitcast", 1>,
-           // CK_LValueBitCast
-           // CK_LValueToRValueBitCast
-           // CK_LValueToRValue
-           // CK_NoOp
-           // CK_BaseToDerived
-           // CK_DerivedToBase
-           // CK_UncheckedDerivedToBase
-           // CK_Dynamic
-           // CK_ToUnion
-           I32EnumAttrCase<"array_to_ptrdecay", 11>,
-           // CK_FunctionToPointerDecay
-           // CK_NullToPointer
-           // CK_NullToMemberPointer
-           // CK_BaseToDerivedMemberPointer
-           // CK_DerivedToBaseMemberPointer
-           I32EnumAttrCase<"member_ptr_to_bool", 17>,
-           // CK_ReinterpretMemberPointer
-           // CK_UserDefinedConversion
-           // CK_ConstructorConversion
-           I32EnumAttrCase<"int_to_ptr", 21>, I32EnumAttrCase<"ptr_to_int", 22>,
-           I32EnumAttrCase<"ptr_to_bool", 23>,
-           // CK_ToVoid
-           // CK_MatrixCast
-           // CK_VectorSplat
-           I32EnumAttrCase<"integral", 27>, I32EnumAttrCase<"int_to_bool", 28>,
-           I32EnumAttrCase<"int_to_float", 29>,
-           // CK_FloatingToFixedPoint
-           // CK_FixedPointToFloating
-           // CK_FixedPointCast
-           // CK_FixedPointToIntegral
-           // CK_IntegralToFixedPoint
-           // CK_FixedPointToBoolean
-           I32EnumAttrCase<"float_to_int", 36>,
-           I32EnumAttrCase<"float_to_bool", 37>,
-           I32EnumAttrCase<"bool_to_int", 38>, I32EnumAttrCase<"floating", 39>,
-           // CK_CPointerToObjCPointerCast
-           // CK_BlockPointerToObjCPointerCast
-           // CK_AnyPointerToBlockPointerCast
-           // CK_ObjCObjectLValueCast
-           I32EnumAttrCase<"float_to_complex", 44>,
-           I32EnumAttrCase<"float_complex_to_real", 45>,
-           I32EnumAttrCase<"float_complex_to_bool", 46>,
-           I32EnumAttrCase<"float_complex", 47>,
-           I32EnumAttrCase<"float_complex_to_int_complex", 48>,
-           I32EnumAttrCase<"int_to_complex", 49>,
-           I32EnumAttrCase<"int_complex_to_real", 50>,
-           I32EnumAttrCase<"int_complex_to_bool", 51>,
-           I32EnumAttrCase<"int_complex", 52>,
-           I32EnumAttrCase<"int_complex_to_float_complex", 53>,
-           // CK_ARCProduceObject
-           // CK_ARCConsumeObject
-           // CK_ARCReclaimReturnedObject
-           // CK_ARCExtendBlockObject
-           // CK_AtomicToNonAtomic
-           // CK_NonAtomicToAtomic
-           // CK_CopyAndAutoreleaseBlockObject
-           // CK_BuiltinFnToFnPtr
-           // CK_ZeroToOCLOpaqueType
-           I32EnumAttrCase<"address_space", 63>,
-           // CK_IntToOCLSampler
-           // CK_HLSLVectorTruncation
-           // CK_HLSLArrayRValue
-           // CK_HLSLElementwiseCast
-           // CK_HLSLAggregateSplatCast
-
-           // Enums below are specific to CIR and don't have a correspondence to
-           // classic codegen:
-           I32EnumAttrCase<"vector_to_int", 999>,
-           I32EnumAttrCase<"bool_to_float", 1000>,
+def CIR_CastKind : CIR_I32EnumAttr<"CastKind", "cast kind", [
+  I32EnumAttrCase<"bitcast", 1>,
+  // CK_LValueBitCast
+  // CK_LValueToRValueBitCast
+  // CK_LValueToRValue
+  // CK_NoOp
+  // CK_BaseToDerived
+  // CK_DerivedToBase
+  // CK_UncheckedDerivedToBase
+  // CK_Dynamic
+  // CK_ToUnion
+  I32EnumAttrCase<"array_to_ptrdecay", 11>,
+  // CK_FunctionToPointerDecay
+  // CK_NullToPointer
+  // CK_NullToMemberPointer
+  // CK_BaseToDerivedMemberPointer
+  // CK_DerivedToBaseMemberPointer
+  I32EnumAttrCase<"member_ptr_to_bool", 17>,
+  // CK_ReinterpretMemberPointer
+  // CK_UserDefinedConversion
+  // CK_ConstructorConversion
+  I32EnumAttrCase<"int_to_ptr", 21>,
+  I32EnumAttrCase<"ptr_to_int", 22>,
+  I32EnumAttrCase<"ptr_to_bool", 23>,
+  // CK_ToVoid
+  // CK_MatrixCast
+  // CK_VectorSplat
+  I32EnumAttrCase<"integral", 27>,
+  I32EnumAttrCase<"int_to_bool", 28>,
+  I32EnumAttrCase<"int_to_float", 29>,
+  // CK_FloatingToFixedPoint
+  // CK_FixedPointToFloating
+  // CK_FixedPointCast
+  // CK_FixedPointToIntegral
+  // CK_IntegralToFixedPoint
+  // CK_FixedPointToBoolean
+  I32EnumAttrCase<"float_to_int", 36>,
+  I32EnumAttrCase<"float_to_bool", 37>,
+  I32EnumAttrCase<"bool_to_int", 38>,
+  I32EnumAttrCase<"floating", 39>,
+  // CK_CPointerToObjCPointerCast
+  // CK_BlockPointerToObjCPointerCast
+  // CK_AnyPointerToBlockPointerCast
+  // CK_ObjCObjectLValueCast
+  I32EnumAttrCase<"float_to_complex", 44>,
+  I32EnumAttrCase<"float_complex_to_real", 45>,
+  I32EnumAttrCase<"float_complex_to_bool", 46>,
+  I32EnumAttrCase<"float_complex", 47>,
+  I32EnumAttrCase<"float_complex_to_int_complex", 48>,
+  I32EnumAttrCase<"int_to_complex", 49>,
+  I32EnumAttrCase<"int_complex_to_real", 50>,
+  I32EnumAttrCase<"int_complex_to_bool", 51>,
+  I32EnumAttrCase<"int_complex", 52>,
+  I32EnumAttrCase<"int_complex_to_float_complex", 53>,
+  // CK_ARCProduceObject
+  // CK_ARCConsumeObject
+  // CK_ARCReclaimReturnedObject
+  // CK_ARCExtendBlockObject
+  // CK_AtomicToNonAtomic
+  // CK_NonAtomicToAtomic
+  // CK_CopyAndAutoreleaseBlockObject
+  // CK_BuiltinFnToFnPtr
+  // CK_ZeroToOCLOpaqueType
+  I32EnumAttrCase<"address_space", 63>,
+  // CK_IntToOCLSampler
+  // CK_HLSLVectorTruncation
+  // CK_HLSLArrayRValue
+  // CK_HLSLElementwiseCast
+  // CK_HLSLAggregateSplatCast
+
+  // Enums below are specific to CIR and don't have a correspondence to classic
+  // codegen:
+  I32EnumAttrCase<"vector_to_int", 999>,
+  I32EnumAttrCase<"bool_to_float", 1000>,
 ]>;
 
 def CIR_CastOp : CIR_Op<"cast", [
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 65f9f1389c478..ae696215163f0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -417,6 +417,25 @@ static mlir::Value emitX86vpcom(CIRGenBuilderTy &builder, mlir::Location loc,
   return builder.createVecCompare(loc, pred, op0, op1);
 }
 
+static cir::CmpOpKind getCmpPredFromImm(unsigned imm) { // imm -> CIR predicate
+  switch (imm) {
+  case 0: // equal
+    return cir::CmpOpKind::eq;
+  case 1: // less than
+    return cir::CmpOpKind::lt;
+  case 2: // less than or equal
+    return cir::CmpOpKind::le;
+  case 4: // not equal
+    return cir::CmpOpKind::ne;
+  case 5: // greater than or equal
+    return cir::CmpOpKind::ge;
+  case 6: // greater than
+    return cir::CmpOpKind::gt;
+  default: // emitX86MaskedCompare handles 3 and 7 itself before calling this
+    llvm_unreachable("Unknown condition code");
+  }
+}
+
 static mlir::Value emitX86MaskedCompare(CIRGenBuilderTy &builder,
                                         mlir::Location loc,
                                         llvm::SmallVector<mlir::Value> ops,
@@ -435,29 +454,7 @@ static mlir::Value emitX86MaskedCompare(CIRGenBuilderTy &builder,
     cmp = cir::VecSplatOp::create(
         builder, loc, ty, builder.getConstAPInt(loc, elementTy, allOnes));
   } else {
-    cir::CmpOpKind pred;
-    switch (imm) {
-    default:
-      llvm_unreachable("Unknown condition code");
-    case 0:
-      pred = cir::CmpOpKind::eq;
-      break;
-    case 1:
-      pred = cir::CmpOpKind::lt;
-      break;
-    case 2:
-      pred = cir::CmpOpKind::le;
-      break;
-    case 4:
-      pred = cir::CmpOpKind::ne;
-      break;
-    case 5:
-      pred = cir::CmpOpKind::ge;
-      break;
-    case 6:
-      pred = cir::CmpOpKind::gt;
-      break;
-    }
+    cir::CmpOpKind pred = getCmpPredFromImm(imm);
     cir::VectorType integralVecTy =
         cir::VectorType::get(builder.getUIntNTy(1), numElts);
     cmp = cir::VecCmpOp::create(builder, loc, integralVecTy, pred, ops[0],