[clang] [CIR] X86 vector masked load builtins (PR #169464)

Wed Feb 11 05:03:16 PST 2026

https://github.com/woruyu updated https://github.com/llvm/llvm-project/pull/169464

>From f54ee2d3a4c3d112a12b87b4e9a0114efeaedc5c Mon Sep 17 00:00:00 2001
From: woruyu <1214539920 at qq.com>
Date: Wed, 11 Feb 2026 21:02:16 +0800
Subject: [PATCH] fix: review and add all testcases

---
 clang/include/clang/CIR/Dialect/IR/CIROps.td  |  45 +++
 clang/lib/CIR/CodeGen/CIRGenBuilder.h         |  16 +
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    |  27 ++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |  21 +
 .../CodeGenBuiltins/X86/avx512vl-builtins.c   | 364 ++++++++++++++++++
 5 files changed, 473 insertions(+)

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 6cebf6e62af6f..77184225b597a 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -665,6 +665,51 @@ def CIR_LoadOp : CIR_Op<"load", [
   // FIXME: add verifier.
 }
 
+//===----------------------------------------------------------------------===//
+// MaskLoadOp
+//===----------------------------------------------------------------------===//
+
+def CIR_VecMaskedLoadOp : CIR_Op<"vec.masked_load", [
+  TypesMatchWith<"type of 'result' must match pointee type of 'addr'",
+    "addr", "result", "mlir::cast<cir::PointerType>($_self).getPointee()">
+]> {
+  let summary = "Masked vector load from memory";
+  let description = [{
+    `cir.masked_load` conditionally loads elements from memory based on a mask.
+    Elements for which the mask is false are taken from `pass_thru`.
+
+    This operation corresponds to LLVM's masked load op (`llvm.intr.maskedload`)
+    and lower directly to it.
+
+    `alignment` can be provided to override the default alignment derived from
+    the pointee/element type data layout.
+
+    Example:
+
+    ```mlir
+    %v = cir.masked_load align(16) %ptr, %mask, %passthru
+         : !cir.ptr<i32>, <4xi1>, <4xi32> -> <4xi32>
+    ```
+  }];
+
+  let arguments = (ins
+    Arg<CIR_PointerType, "base address (points to element type)", [MemRead]>:$addr,
+    CIR_VectorType:$mask,
+    CIR_VectorType:$pass_thru,
+    OptionalAttr<IntValidAlignment<I64Attr>>:$alignment
+  );
+
+  let results = (outs CIR_AnyType:$result);
+
+  let assemblyFormat = [{
+    (`align` `(` $alignment^ `)`)? 
+    $addr `,` $mask `,` $pass_thru
+    `:` qualified(type($addr)) `,` type($mask) `,` type($pass_thru)
+    `->` type($result)
+    attr-dict
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // StoreOp
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index dedb369bf3f67..031ec98296d60 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -654,6 +654,22 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
                                       addr.getAlignment().getAsAlign().value());
   }
 
+  mlir::Value createMaskedLoad(mlir::Location loc, mlir::Type ty,
+                               mlir::Value ptr, llvm::Align alignment,
+                               mlir::Value mask, mlir::Value passThru) {
+    assert(mlir::isa<cir::VectorType>(ty) && "Type should be vector");
+    assert(mask && "Mask should not be all-ones (null)");
+
+    if (!passThru)
+      passThru = this->getConstant(loc, cir::PoisonAttr::get(ty));
+
+    auto alignAttr =
+        this->getI64IntegerAttr(static_cast<int64_t>(alignment.value()));
+
+    return cir::VecMaskedLoadOp::create(*this, loc, ty, ptr, mask, passThru,
+                                        alignAttr);
+  }
+
   cir::VecShuffleOp
   createVecShuffle(mlir::Location loc, mlir::Value vec1, mlir::Value vec2,
                    llvm::ArrayRef<mlir::Attribute> maskAttrs) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 80022998448ad..0563e5dbea3a9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -759,6 +759,18 @@ static mlir::Value emitX86Aeswide(CIRGenBuilderTy &builder, mlir::Location loc,
   return cir::ExtractMemberOp::create(builder, loc, rstValueRec, /*index=*/0);
 }
 
+static mlir::Value emitX86MaskedLoad(CIRGenBuilderTy &builder,
+                                     ArrayRef<mlir::Value> ops,
+                                     llvm::Align alignment,
+                                     mlir::Location loc) {
+  mlir::Type ty = ops[1].getType();
+  mlir::Value ptr = ops[0];
+  mlir::Value maskVec = getMaskVecValue(builder, loc, ops[2],
+                                        cast<cir::VectorType>(ty).getSize());
+
+  return builder.createMaskedLoad(loc, ty, ptr, alignment, maskVec, ops[1]);
+}
+
 std::optional<mlir::Value>
 CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -1112,6 +1124,11 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
   case X86::BI__builtin_ia32_movdqa64store512_mask:
   case X86::BI__builtin_ia32_storeaps512_mask:
   case X86::BI__builtin_ia32_storeapd512_mask:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
+
   case X86::BI__builtin_ia32_loadups128_mask:
   case X86::BI__builtin_ia32_loadups256_mask:
   case X86::BI__builtin_ia32_loadups512_mask:
@@ -1134,6 +1151,9 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
   case X86::BI__builtin_ia32_loadsh128_mask:
   case X86::BI__builtin_ia32_loadss128_mask:
   case X86::BI__builtin_ia32_loadsd128_mask:
+    return emitX86MaskedLoad(builder, ops, llvm::Align(1),
+                             getLoc(expr->getExprLoc()));
+
   case X86::BI__builtin_ia32_loadaps128_mask:
   case X86::BI__builtin_ia32_loadaps256_mask:
   case X86::BI__builtin_ia32_loadaps512_mask:
@@ -1146,6 +1166,13 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
   case X86::BI__builtin_ia32_movdqa64load128_mask:
   case X86::BI__builtin_ia32_movdqa64load256_mask:
   case X86::BI__builtin_ia32_movdqa64load512_mask:
+    return emitX86MaskedLoad(
+        builder, ops,
+        getContext()
+            .getTypeAlignInChars(expr->getArg(1)->getType())
+            .getAsAlign(),
+        getLoc(expr->getExprLoc()));
+
   case X86::BI__builtin_ia32_expandloaddf128_mask:
   case X86::BI__builtin_ia32_expandloaddf256_mask:
   case X86::BI__builtin_ia32_expandloaddf512_mask:
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 0e50d9c595564..ae82eb9fae983 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1796,6 +1796,27 @@ mlir::LogicalResult CIRToLLVMLoadOpLowering::matchAndRewrite(
   return mlir::LogicalResult::success();
 }
 
+mlir::LogicalResult
+cir::direct::CIRToLLVMVecMaskedLoadOpLowering::matchAndRewrite(
+    cir::VecMaskedLoadOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  const mlir::Type llvmResTy =
+      convertTypeForMemory(*getTypeConverter(), dataLayout, op.getType());
+
+  std::optional<size_t> opAlign = op.getAlignment();
+  unsigned alignment =
+      (unsigned)opAlign.value_or(dataLayout.getTypeABIAlignment(llvmResTy));
+
+  mlir::IntegerAttr alignAttr = rewriter.getI32IntegerAttr(alignment);
+
+  auto newLoad = mlir::LLVM::MaskedLoadOp::create(
+      rewriter, op.getLoc(), llvmResTy, adaptor.getAddr(), adaptor.getMask(),
+      adaptor.getPassThru(), alignAttr);
+
+  rewriter.replaceOp(op, newLoad.getResult());
+  return mlir::success();
+}
+
 mlir::LogicalResult CIRToLLVMStoreOpLowering::matchAndRewrite(
     cir::StoreOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c
index f03fc75565b1a..e3cbc0fc10524 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c
@@ -363,3 +363,367 @@ __m256i test_mm256_shuffle_i64x2(__m256i a, __m256i b) {
   // OGCG: shufflevector <4 x i64> %{{.+}}, <4 x i64> %{{.+}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   return _mm256_shuffle_i64x2(a, b, 0x03);
 }
+
+__m128 test_mm_mask_loadu_ps(__m128 __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_mask_loadu_ps
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.float>>, <4 x !cir.int<s, 1>>, <4 x !cir.float> -> !cir.vector<4 x !cir.float>
+                                                           
+  // LLVM-LABEL: test_mm_mask_loadu_ps
+  // LLVM: [[MASK4:%.*]] = shufflevector <8 x i1> %{{.+}}, <8 x i1> %{{.+}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM: call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.+}}, <4 x i1> [[MASK4]], <4 x float> %{{.+}})
+
+  // OGCG-LABEL: test_mm_mask_loadu_ps
+  // OGCG: [[MASK4:%.*]] = shufflevector <8 x i1> %{{.+}}, <8 x i1> %{{.+}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // OGCG: call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.+}}, <4 x i1> [[MASK4]], <4 x float> %{{.+}})
+  return _mm_mask_loadu_ps(__W, __U, __P);
+}
+
+__m128 test_mm_maskz_loadu_ps(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_maskz_loadu_ps
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.float>>, <4 x !cir.int<s, 1>>, <4 x !cir.float> -> !cir.vector<4 x !cir.float>
+
+  // LLVM-LABEL: test_mm_maskz_loadu_ps
+  // LLVM: [[MASK4:%.*]] = shufflevector <8 x i1> %{{.+}}, <8 x i1> %{{.+}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM: call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.+}}, <4 x i1> [[MASK4]], <4 x float> %{{.+}})
+
+  // OGCG-LABEL: test_mm_maskz_loadu_ps
+  // OGCG: [[MASK4:%.*]] = shufflevector <8 x i1> %{{.+}}, <8 x i1> %{{.+}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // OGCG: call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.+}}, <4 x i1> [[MASK4]], <4 x float> %{{.+}})
+  return _mm_maskz_loadu_ps(__U, __P); 
+}
+
+__m256 test_mm256_mask_loadu_ps(__m256 __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_mask_loadu_ps
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<8 x !cir.float>>, <8 x !cir.int<s, 1>>, <8 x !cir.float> -> !cir.vector<8 x !cir.float>
+
+  // LLVM-LABEL: test_mm256_mask_loadu_ps
+  // LLVM: @llvm.masked.load.v8f32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_mask_loadu_ps
+  // OGCG: @llvm.masked.load.v8f32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_mask_loadu_ps(__W, __U, __P); 
+}
+
+__m256 test_mm256_maskz_loadu_ps(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_maskz_loadu_ps
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<8 x !cir.float>>, <8 x !cir.int<s, 1>>, <8 x !cir.float> -> !cir.vector<8 x !cir.float>
+
+  // LLVM-LABEL: test_mm256_maskz_loadu_ps
+  // LLVM: @llvm.masked.load.v8f32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_maskz_loadu_ps
+  // OGCG: @llvm.masked.load.v8f32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_maskz_loadu_ps(__U, __P); 
+}
+
+__m256d test_mm256_mask_loadu_pd(__m256d __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_mask_loadu_pd
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.double>>, <4 x !cir.int<s, 1>>, <4 x !cir.double> -> !cir.vector<4 x !cir.double>
+  
+  // LLVM-LABEL: @test_mm256_mask_loadu_pd
+  // LLVM: @llvm.masked.load.v4f64.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_mask_loadu_pd
+  // OGCG: @llvm.masked.load.v4f64.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_mask_loadu_pd(__W, __U, __P); 
+}
+
+__m128i test_mm_mask_loadu_epi32(__m128i __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_mask_loadu_epi32
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !s32i>>, <4 x !cir.int<s, 1>>, <4 x !s32i> -> !cir.vector<4 x !s32i>
+
+  // LLVM-LABEL: @test_mm_mask_loadu_epi32
+  // LLVM: @llvm.masked.load.v4i32.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+
+  // OGCG-LABEL: @test_mm_mask_loadu_epi32
+  // OGCG: @llvm.masked.load.v4i32.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_mask_loadu_epi32(__W, __U, __P); 
+}
+
+__m256i test_mm256_mask_loadu_epi32(__m256i __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_mask_loadu_epi32
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<8 x !s32i>>, <8 x !cir.int<s, 1>>, <8 x !s32i> -> !cir.vector<8 x !s32i>
+
+  // LLVM-LABEL: @test_mm256_mask_loadu_epi32
+  // LLVM: @llvm.masked.load.v8i32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_mask_loadu_epi32
+  // OGCG: @llvm.masked.load.v8i32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_mask_loadu_epi32(__W, __U, __P); 
+}
+
+__m128i test_mm_mask_loadu_epi64(__m128i __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_mask_loadu_epi64
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<2 x !s64i>>, <2 x !cir.int<s, 1>>, <2 x !s64i> -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: @test_mm_mask_loadu_epi64
+  // LLVM: @llvm.masked.load.v2i64.p0(ptr align 1 %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+
+  // OGCG-LABEL: @test_mm_mask_loadu_epi64
+  // OGCG: @llvm.masked.load.v2i64.p0(ptr align 1 %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_mask_loadu_epi64(__W, __U, __P); 
+}
+
+__m256i test_mm256_mask_loadu_epi64(__m256i __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_mask_loadu_epi64
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !s64i>>, <4 x !cir.int<s, 1>>, <4 x !s64i> -> !cir.vector<4 x !s64i>
+
+  // LLVM-LABEL: @test_mm256_mask_loadu_epi64
+  // LLVM: @llvm.masked.load.v4i64.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_mask_loadu_epi64
+  // OGCG: @llvm.masked.load.v4i64.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_mask_loadu_epi64(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_loadu_epi64(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_maskz_loadu_epi64
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !s64i>>, <4 x !cir.int<s, 1>>, <4 x !s64i> -> !cir.vector<4 x !s64i>
+
+  // LLVM-LABEL: @test_mm256_maskz_loadu_epi64
+  // LLVM: @llvm.masked.load.v4i64.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_maskz_loadu_epi64
+  // OGCG: @llvm.masked.load.v4i64.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_maskz_loadu_epi64(__U, __P); 
+}
+
+__m128 test_mm_mask_load_ps(__m128 __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_mask_load_ps
+  // CIR: cir.vec.masked_load align(16) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.float>>, <4 x !cir.int<s, 1>>, <4 x !cir.float> -> !cir.vector<4 x !cir.float>
+
+  // LLVM-LABEL: @test_mm_mask_load_ps
+  // LLVM: @llvm.masked.load.v4f32.p0(ptr align 16 %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+
+  // OGCG-LABEL: @test_mm_mask_load_ps
+  // OGCG: @llvm.masked.load.v4f32.p0(ptr align 16 %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_mask_load_ps(__W, __U, __P); 
+}
+
+__m128 test_mm_maskz_load_ps(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_maskz_load_ps
+  // CIR: cir.vec.masked_load align(16) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.float>>, <4 x !cir.int<s, 1>>, <4 x !cir.float> -> !cir.vector<4 x !cir.float>
+
+  // LLVM-LABEL: @test_mm_maskz_load_ps
+  // LLVM: @llvm.masked.load.v4f32.p0(ptr align 16 %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+
+  // OGCG-LABEL: @test_mm_maskz_load_ps
+  // OGCG: @llvm.masked.load.v4f32.p0(ptr align 16 %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_maskz_load_ps(__U, __P); 
+}
+
+__m256 test_mm256_mask_load_ps(__m256 __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_mask_load_ps
+  // CIR: cir.vec.masked_load align(32) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<8 x !cir.float>>, <8 x !cir.int<s, 1>>, <8 x !cir.float> -> !cir.vector<8 x !cir.float>
+
+  // LLVM-LABEL: @test_mm256_mask_load_ps
+  // LLVM: @llvm.masked.load.v8f32.p0(ptr align 32 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_mask_load_ps
+  // OGCG: @llvm.masked.load.v8f32.p0(ptr align 32 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_mask_load_ps(__W, __U, __P); 
+}
+
+__m256 test_mm256_maskz_load_ps(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_maskz_load_ps
+  // CIR: cir.vec.masked_load align(32) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<8 x !cir.float>>, <8 x !cir.int<s, 1>>, <8 x !cir.float> -> !cir.vector<8 x !cir.float>
+
+  // LLVM-LABEL: @test_mm256_maskz_load_ps
+  // LLVM: @llvm.masked.load.v8f32.p0(ptr align 32 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_maskz_load_ps
+  // OGCG: @llvm.masked.load.v8f32.p0(ptr align 32 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_maskz_load_ps(__U, __P); 
+}
+
+__m128d test_mm_mask_load_pd(__m128d __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_mask_load_pd
+  // CIR: cir.vec.masked_load align(16) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<2 x !cir.double>>, <2 x !cir.int<s, 1>>, <2 x !cir.double> -> !cir.vector<2 x !cir.double>
+
+  // LLVM-LABEL: @test_mm_mask_load_pd
+  // LLVM: @llvm.masked.load.v2f64.p0(ptr align 16 %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+
+  // OGCG-LABEL: @test_mm_mask_load_pd
+  // OGCG: @llvm.masked.load.v2f64.p0(ptr align 16 %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_mask_load_pd(__W, __U, __P); 
+}
+
+__m128d test_mm_maskz_load_pd(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_maskz_load_pd
+  // CIR: cir.vec.masked_load align(16) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<2 x !cir.double>>, <2 x !cir.int<s, 1>>, <2 x !cir.double> -> !cir.vector<2 x !cir.double>
+
+  // LLVM-LABEL: @test_mm_maskz_load_pd
+  // LLVM: @llvm.masked.load.v2f64.p0(ptr align 16 %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+
+  // OGCG-LABEL: @test_mm_maskz_load_pd
+  // OGCG: @llvm.masked.load.v2f64.p0(ptr align 16 %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_maskz_load_pd(__U, __P); 
+}
+
+__m128d test_mm_maskz_loadu_pd(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_maskz_loadu_pd
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<2 x !cir.double>>, <2 x !cir.int<s, 1>>, <2 x !cir.double> -> !cir.vector<2 x !cir.double>
+
+  // LLVM-LABEL: @test_mm_maskz_loadu_pd
+  // LLVM: @llvm.masked.load.v2f64.p0(ptr align 1 %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+
+  // OGCG-LABEL: @test_mm_maskz_loadu_pd
+  // OGCG: @llvm.masked.load.v2f64.p0(ptr align 1 %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_maskz_loadu_pd(__U, __P); 
+}
+
+__m256d test_mm256_mask_load_pd(__m256d __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_mask_load_pd
+  // CIR: cir.vec.masked_load align(32) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.double>>, <4 x !cir.int<s, 1>>, <4 x !cir.double> -> !cir.vector<4 x !cir.double>
+
+  // LLVM-LABEL: @test_mm256_mask_load_pd
+  // LLVM: @llvm.masked.load.v4f64.p0(ptr align 32 %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_mask_load_pd
+  // OGCG: @llvm.masked.load.v4f64.p0(ptr align 32 %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_mask_load_pd(__W, __U, __P); 
+}
+
+__m256d test_mm256_maskz_load_pd(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_maskz_load_pd
+  // CIR: cir.vec.masked_load align(32) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.double>>, <4 x !cir.int<s, 1>>, <4 x !cir.double> -> !cir.vector<4 x !cir.double>
+
+  // LLVM-LABEL: @test_mm256_maskz_load_pd
+  // LLVM: @llvm.masked.load.v4f64.p0(ptr align 32 %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_maskz_load_pd
+  // OGCG: @llvm.masked.load.v4f64.p0(ptr align 32 %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_maskz_load_pd(__U, __P); 
+}
+
+__m256d test_mm256_maskz_loadu_pd(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_maskz_loadu_pd
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.double>>, <4 x !cir.int<s, 1>>, <4 x !cir.double> -> !cir.vector<4 x !cir.double>
+
+  // LLVM-LABEL: @test_mm256_maskz_loadu_pd
+  // LLVM: @llvm.masked.load.v4f64.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_maskz_loadu_pd
+  // OGCG: @llvm.masked.load.v4f64.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_maskz_loadu_pd(__U, __P); 
+}
+
+__m128i test_mm_mask_load_epi32(__m128i __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_mask_load_epi32
+  // CIR: cir.vec.masked_load align(16) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !s32i>>, <4 x !cir.int<s, 1>>, <4 x !s32i> -> !cir.vector<4 x !s32i>
+
+  // LLVM-LABEL: @test_mm_mask_load_epi32
+  // LLVM: @llvm.masked.load.v4i32.p0(ptr align 16 %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+
+  // OGCG-LABEL: @test_mm_mask_load_epi32
+  // OGCG: @llvm.masked.load.v4i32.p0(ptr align 16 %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_mask_load_epi32(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_load_epi32(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_maskz_load_epi32
+  // CIR: cir.vec.masked_load align(16) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !s32i>>, <4 x !cir.int<s, 1>>, <4 x !s32i> -> !cir.vector<4 x !s32i>
+
+  // LLVM-LABEL: @test_mm_maskz_load_epi32
+  // LLVM: @llvm.masked.load.v4i32.p0(ptr align 16 %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+
+  // OGCG-LABEL: @test_mm_maskz_load_epi32
+  // OGCG: @llvm.masked.load.v4i32.p0(ptr align 16 %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_maskz_load_epi32(__U, __P); 
+}
+
+__m128i test_mm_maskz_loadu_epi32(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_maskz_loadu_epi32
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !s32i>>, <4 x !cir.int<s, 1>>, <4 x !s32i> -> !cir.vector<4 x !s32i>
+
+  // LLVM-LABEL: @test_mm_maskz_loadu_epi32
+  // LLVM: @llvm.masked.load.v4i32.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+
+  // OGCG-LABEL: @test_mm_maskz_loadu_epi32
+  // OGCG: @llvm.masked.load.v4i32.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_maskz_loadu_epi32(__U, __P); 
+}
+
+__m256i test_mm256_maskz_load_epi32(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_maskz_load_epi32
+  // CIR: cir.vec.masked_load align(32) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<8 x !s32i>>, <8 x !cir.int<s, 1>>, <8 x !s32i> -> !cir.vector<8 x !s32i>
+
+  // LLVM-LABEL: @test_mm256_maskz_load_epi32
+  // LLVM: @llvm.masked.load.v8i32.p0(ptr align 32 %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_maskz_load_epi32
+  // OGCG: @llvm.masked.load.v8i32.p0(ptr align 32 %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_maskz_load_epi32(__U, __P); 
+}
+
+__m256i test_mm256_maskz_loadu_epi32(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_maskz_loadu_epi32
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<8 x !s32i>>, <8 x !cir.int<s, 1>>, <8 x !s32i> -> !cir.vector<8 x !s32i>
+
+  // LLVM-LABEL: @test_mm256_maskz_loadu_epi32
+  // LLVM: @llvm.masked.load.v8i32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_maskz_loadu_epi32
+  // OGCG: @llvm.masked.load.v8i32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_maskz_loadu_epi32(__U, __P); 
+}
+
+__m128i test_mm_mask_load_epi64(__m128i __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_mask_load_epi64
+  // CIR: cir.vec.masked_load align(16) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<2 x !s64i>>, <2 x !cir.int<s, 1>>, <2 x !s64i> -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: @test_mm_mask_load_epi64
+  // LLVM: @llvm.masked.load.v2i64.p0(ptr align 16 %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+
+  // OGCG-LABEL: @test_mm_mask_load_epi64
+  // OGCG: @llvm.masked.load.v2i64.p0(ptr align 16 %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})  
+  return _mm_mask_load_epi64(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_loadu_epi64(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_maskz_loadu_epi64
+  // CIR: cir.vec.masked_load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<2 x !s64i>>, <2 x !cir.int<s, 1>>, <2 x !s64i> -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: @test_mm_maskz_loadu_epi64
+  // LLVM: @llvm.masked.load.v2i64.p0(ptr align 1 %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+
+  // OGCG-LABEL: @test_mm_maskz_loadu_epi64
+  // OGCG: @llvm.masked.load.v2i64.p0(ptr align 1 %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})  
+  return _mm_maskz_loadu_epi64(__U, __P); 
+}
+
+__m128i test_mm_maskz_load_epi64(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm_maskz_load_epi64
+  // CIR: cir.vec.masked_load align(16) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<2 x !s64i>>, <2 x !cir.int<s, 1>>, <2 x !s64i> -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: @test_mm_maskz_load_epi64
+  // LLVM: @llvm.masked.load.v2i64.p0(ptr align 16 %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+
+  // OGCG-LABEL: @test_mm_maskz_load_epi64
+  // OGCG: @llvm.masked.load.v2i64.p0(ptr align 16 %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})  
+  return _mm_maskz_load_epi64(__U, __P); 
+}
+
+__m256i test_mm256_mask_load_epi64(__m256i __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_mask_load_epi64
+  // CIR: cir.vec.masked_load align(32) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !s64i>>, <4 x !cir.int<s, 1>>, <4 x !s64i> -> !cir.vector<4 x !s64i>
+
+  // LLVM-LABEL: @test_mm256_mask_load_epi64
+  // LLVM: @llvm.masked.load.v4i64.p0(ptr align 32 %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_mask_load_epi64
+  // OGCG: @llvm.masked.load.v4i64.p0(ptr align 32 %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}}) 
+  return _mm256_mask_load_epi64(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_load_epi64(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm256_maskz_load_epi64
+  // CIR: cir.vec.masked_load align(32) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !s64i>>, <4 x !cir.int<s, 1>>, <4 x !s64i> -> !cir.vector<4 x !s64i>
+
+  // LLVM-LABEL: @test_mm256_maskz_load_epi64
+  // LLVM: @llvm.masked.load.v4i64.p0(ptr align 32 %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+
+  // OGCG-LABEL: @test_mm256_maskz_load_epi64
+  // OGCG: @llvm.masked.load.v4i64.p0(ptr align 32 %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})  
+  return _mm256_maskz_load_epi64(__U, __P); 
+}