[clang] [CIR] X86 vector masked load builtins (PR #169464)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Feb 10 00:34:57 PST 2026
https://github.com/woruyu updated https://github.com/llvm/llvm-project/pull/169464
>From 6e9008193037db7048645edeb35653b5d3a66840 Mon Sep 17 00:00:00 2001
From: woruyu <1214539920 at qq.com>
Date: Tue, 10 Feb 2026 16:34:39 +0800
Subject: [PATCH] [CIR] X86 vector masked load builtins
---
clang/include/clang/CIR/Dialect/IR/CIROps.td | 47 ++++++++++++++++++
clang/lib/CIR/CodeGen/CIRGenBuilder.h | 16 ++++++
clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 27 ++++++++++
.../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 20 ++++++++
.../X86/avx512vl-builtins-test.c | 49 +++++++++++++++++++
.../CodeGenBuiltins/X86/avx512vl-builtins.c | 40 +++++++++++++++
6 files changed, 199 insertions(+)
create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins-test.c
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 6cebf6e62af6f..770b2c76c7fb9 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -665,6 +665,53 @@ def CIR_LoadOp : CIR_Op<"load", [
// FIXME: add verifier.
}
+//===----------------------------------------------------------------------===//
+// MaskedLoadOp
+//===----------------------------------------------------------------------===//
+
+def CIR_MaskedLoadOp : CIR_Op<"mask.load", [
+ TypesMatchWith<"type of 'result' matches pointee type of 'addr'",
+ "addr", "result", "mlir::cast<cir::PointerType>($_self).getPointee()">
+]> {
+ let summary = "Masked vector load from memory";
+ let description = [{
+    `cir.mask.load` conditionally loads elements from memory based on a mask.
+ Elements for which the mask is false are taken from `pass_thru`.
+
+    This operation is intended to correspond closely to LLVM's masked load
+    intrinsic (`llvm.intr.masked.load` / `LLVM::MaskedLoadOp`) and lower
+    directly to it.
+
+    `alignment` can be provided to override the default ABI alignment
+    derived from the result vector type's data layout.
+
+ Example:
+
+ ```mlir
+    %v = cir.mask.load align(16) %ptr, %mask, %passthru
+        : !cir.ptr<!cir.vector<4 x !cir.float>>, <4 x !cir.int<s, 1>>,
+          <4 x !cir.float> -> !cir.vector<4 x !cir.float>
+ ```
+ }];
+
+ let arguments = (ins
+    Arg<CIR_PointerType, "base address (points to the result vector type)",
+        [MemRead]>:$addr,
+ CIR_VectorType:$mask,
+ CIR_VectorType:$pass_thru,
+ OptionalAttr<IntValidAlignment<I64Attr>>:$alignment
+ );
+
+ let results = (outs CIR_AnyType:$result);
+
+ let assemblyFormat = [{
+ (`align` `(` $alignment^ `)`)?
+ $addr `,` $mask `,` $pass_thru
+ `:` qualified(type($addr)) `,` type($mask) `,` type($pass_thru)
+ `->` type($result)
+ attr-dict
+ }];
+
+ // FIXME: add verifier
+}
+
//===----------------------------------------------------------------------===//
// StoreOp
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index dedb369bf3f67..00a86377f8c99 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -654,6 +654,22 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
addr.getAlignment().getAsAlign().value());
}
+ mlir::Value createMaskedLoad(mlir::Location loc, mlir::Type ty,
+ mlir::Value ptr, llvm::Align alignment,
+ mlir::Value mask, mlir::Value passThru) {
+    assert(mlir::isa<cir::VectorType>(ty) && "result type must be a vector");
+    assert(mask && "null mask denotes all-ones; use a regular load instead");
+
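+    // A null pass-through value defaults to poison, mirroring
+    // llvm::IRBuilder::CreateMaskedLoad, so masked-off lanes carry no
+    // defined value.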
+ if (!passThru)
+ passThru = this->getConstant(loc, cir::PoisonAttr::get(ty));
+
+ auto alignAttr =
+ this->getI64IntegerAttr(static_cast<int64_t>(alignment.value()));
+
+ return cir::MaskedLoadOp::create(*this, loc, ty, ptr, mask, passThru,
+ alignAttr);
+ }
+
cir::VecShuffleOp
createVecShuffle(mlir::Location loc, mlir::Value vec1, mlir::Value vec2,
llvm::ArrayRef<mlir::Attribute> maskAttrs) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 80022998448ad..0563e5dbea3a9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -759,6 +759,18 @@ static mlir::Value emitX86Aeswide(CIRGenBuilderTy &builder, mlir::Location loc,
return cir::ExtractMemberOp::create(builder, loc, rstValueRec, /*index=*/0);
}
+static mlir::Value emitX86MaskedLoad(CIRGenBuilderTy &builder,
+ ArrayRef<mlir::Value> ops,
+ llvm::Align alignment,
+ mlir::Location loc) {
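+  // Operand layout follows the builtin: ops[0] is the source pointer,
+  // ops[1] the pass-through vector (which also fixes the result type), and
+  // ops[2] the scalar mask to be widened into a vector of i1 elements.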
+ mlir::Type ty = ops[1].getType();
+ mlir::Value ptr = ops[0];
+ mlir::Value maskVec = getMaskVecValue(builder, loc, ops[2],
+ cast<cir::VectorType>(ty).getSize());
+
+ return builder.createMaskedLoad(loc, ty, ptr, alignment, maskVec, ops[1]);
+}
+
std::optional<mlir::Value>
CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -1112,6 +1124,11 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
case X86::BI__builtin_ia32_movdqa64store512_mask:
case X86::BI__builtin_ia32_storeaps512_mask:
case X86::BI__builtin_ia32_storeapd512_mask:
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return {};
+
case X86::BI__builtin_ia32_loadups128_mask:
case X86::BI__builtin_ia32_loadups256_mask:
case X86::BI__builtin_ia32_loadups512_mask:
@@ -1134,6 +1151,9 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
case X86::BI__builtin_ia32_loadsh128_mask:
case X86::BI__builtin_ia32_loadss128_mask:
case X86::BI__builtin_ia32_loadsd128_mask:
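+    // Unaligned variants (loadu*/loads*) use alignment 1, matching classic
+    // codegen.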
+ return emitX86MaskedLoad(builder, ops, llvm::Align(1),
+ getLoc(expr->getExprLoc()));
+
case X86::BI__builtin_ia32_loadaps128_mask:
case X86::BI__builtin_ia32_loadaps256_mask:
case X86::BI__builtin_ia32_loadaps512_mask:
@@ -1146,6 +1166,13 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
case X86::BI__builtin_ia32_movdqa64load128_mask:
case X86::BI__builtin_ia32_movdqa64load256_mask:
case X86::BI__builtin_ia32_movdqa64load512_mask:
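+    // Aligned variants: the required alignment is the natural alignment of
+    // the builtin's vector argument type.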
+ return emitX86MaskedLoad(
+ builder, ops,
+ getContext()
+ .getTypeAlignInChars(expr->getArg(1)->getType())
+ .getAsAlign(),
+ getLoc(expr->getExprLoc()));
+
case X86::BI__builtin_ia32_expandloaddf128_mask:
case X86::BI__builtin_ia32_expandloaddf256_mask:
case X86::BI__builtin_ia32_expandloaddf512_mask:
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 0e50d9c595564..444a7c15396b5 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1796,6 +1796,26 @@ mlir::LogicalResult CIRToLLVMLoadOpLowering::matchAndRewrite(
return mlir::LogicalResult::success();
}
+mlir::LogicalResult cir::direct::CIRToLLVMMaskedLoadOpLowering::matchAndRewrite(
+ cir::MaskedLoadOp op, OpAdaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter) const {
+ const mlir::Type llvmResTy =
+ convertTypeForMemory(*getTypeConverter(), dataLayout, op.getType());
+
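+  // Use the explicit alignment when present; otherwise fall back to the ABI
+  // alignment of the converted result type.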
+ std::optional<size_t> opAlign = op.getAlignment();
+  unsigned alignment = static_cast<unsigned>(
+      opAlign.value_or(dataLayout.getTypeABIAlignment(llvmResTy)));
+
+ auto alignAttr = rewriter.getI32IntegerAttr(alignment);
+
+ mlir::LLVM::MaskedLoadOp newLoad = mlir::LLVM::MaskedLoadOp::create(
+ rewriter, op.getLoc(), llvmResTy, adaptor.getAddr(), adaptor.getMask(),
+ adaptor.getPassThru(), alignAttr);
+
+ rewriter.replaceOp(op, newLoad.getResult());
+ return mlir::success();
+}
+
mlir::LogicalResult CIRToLLVMStoreOpLowering::matchAndRewrite(
cir::StoreOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const {
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins-test.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins-test.c
new file mode 100644
index 0000000000000..8525af17ddb90
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins-test.c
@@ -0,0 +1,49 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512vl -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512vl -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512vl -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=OGCG --input-file=%t.ll %s
+
+#include <immintrin.h>
+
+__m128 test_mm_mask_loadu_ps(__m128 __W, __mmask8 __U, void const *__P) {
+ // CIR-LABEL: _mm_mask_loadu_ps
+ // CIR: cir.mask.load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.float>>, <4 x !cir.int<s, 1>>, <4 x !cir.float> -> !cir.vector<4 x !cir.float>
+
+ // LLVM-LABEL: test_mm_mask_loadu_ps
+ // LLVM: [[MASK4:%.*]] = shufflevector <8 x i1> %{{.+}}, <8 x i1> %{{.+}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // LLVM: call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.+}}, <4 x i1> [[MASK4]], <4 x float> %{{.+}})
+
+ // OGCG-LABEL: test_mm_mask_loadu_ps
+ // OGCG: [[MASK4:%.*]] = shufflevector <8 x i1> %{{.+}}, <8 x i1> %{{.+}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // OGCG: call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.+}}, <4 x i1> [[MASK4]], <4 x float> %{{.+}})
+ return _mm_mask_loadu_ps(__W, __U, __P);
+}
+
+__m128 test_mm_maskz_loadu_ps(__mmask8 __U, void const *__P) {
+ // CIR-LABEL: _mm_maskz_loadu_ps
+ // CIR: cir.mask.load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.float>>, <4 x !cir.int<s, 1>>, <4 x !cir.float> -> !cir.vector<4 x !cir.float>
+
+ // LLVM-LABEL: test_mm_maskz_loadu_ps
+ // LLVM: [[MASK4:%.*]] = shufflevector <8 x i1> %{{.+}}, <8 x i1> %{{.+}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // LLVM: call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.+}}, <4 x i1> [[MASK4]], <4 x float> %{{.+}})
+
+ // OGCG-LABEL: test_mm_maskz_loadu_ps
+ // OGCG: [[MASK4:%.*]] = shufflevector <8 x i1> %{{.+}}, <8 x i1> %{{.+}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // OGCG: call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.+}}, <4 x i1> [[MASK4]], <4 x float> %{{.+}})
+ return _mm_maskz_loadu_ps(__U, __P);
+}
+
+__m256 test_mm256_mask_loadu_ps(__m256 __W, __mmask8 __U, void const *__P) {
+ // CIR-LABEL: _mm256_mask_loadu_ps
+ // CIR: cir.mask.load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<8 x !cir.float>>, <8 x !cir.int<s, 1>>, <8 x !cir.float> -> !cir.vector<8 x !cir.float>
+
+ // LLVM-LABEL: test_mm256_mask_loadu_ps
+ // LLVM: @llvm.masked.load.v8f32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+
+ // OGCG-LABEL: @test_mm256_mask_loadu_ps
+ // OGCG: @llvm.masked.load.v8f32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+ return _mm256_mask_loadu_ps(__W, __U, __P);
+}
+
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c
index f03fc75565b1a..3926062e9efda 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c
@@ -363,3 +363,43 @@ __m256i test_mm256_shuffle_i64x2(__m256i a, __m256i b) {
// OGCG: shufflevector <4 x i64> %{{.+}}, <4 x i64> %{{.+}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
return _mm256_shuffle_i64x2(a, b, 0x03);
}
+
+__m128 test_mm_mask_loadu_ps(__m128 __W, __mmask8 __U, void const *__P) {
+ // CIR-LABEL: _mm_mask_loadu_ps
+ // CIR: cir.mask.load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.float>>, <4 x !cir.int<s, 1>>, <4 x !cir.float> -> !cir.vector<4 x !cir.float>
+
+ // LLVM-LABEL: test_mm_mask_loadu_ps
+ // LLVM: [[MASK4:%.*]] = shufflevector <8 x i1> %{{.+}}, <8 x i1> %{{.+}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // LLVM: call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.+}}, <4 x i1> [[MASK4]], <4 x float> %{{.+}})
+
+ // OGCG-LABEL: test_mm_mask_loadu_ps
+ // OGCG: [[MASK4:%.*]] = shufflevector <8 x i1> %{{.+}}, <8 x i1> %{{.+}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // OGCG: call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.+}}, <4 x i1> [[MASK4]], <4 x float> %{{.+}})
+ return _mm_mask_loadu_ps(__W, __U, __P);
+}
+
+__m128 test_mm_maskz_loadu_ps(__mmask8 __U, void const *__P) {
+ // CIR-LABEL: _mm_maskz_loadu_ps
+ // CIR: cir.mask.load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.float>>, <4 x !cir.int<s, 1>>, <4 x !cir.float> -> !cir.vector<4 x !cir.float>
+
+ // LLVM-LABEL: test_mm_maskz_loadu_ps
+ // LLVM: [[MASK4:%.*]] = shufflevector <8 x i1> %{{.+}}, <8 x i1> %{{.+}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // LLVM: call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.+}}, <4 x i1> [[MASK4]], <4 x float> %{{.+}})
+
+ // OGCG-LABEL: test_mm_maskz_loadu_ps
+ // OGCG: [[MASK4:%.*]] = shufflevector <8 x i1> %{{.+}}, <8 x i1> %{{.+}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // OGCG: call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.+}}, <4 x i1> [[MASK4]], <4 x float> %{{.+}})
+ return _mm_maskz_loadu_ps(__U, __P);
+}
+
+__m256 test_mm256_mask_loadu_ps(__m256 __W, __mmask8 __U, void const *__P) {
+ // CIR-LABEL: _mm256_mask_loadu_ps
+ // CIR: cir.mask.load align(1) %{{.*}}, %{{.*}}, %{{.*}} : !cir.ptr<!cir.vector<8 x !cir.float>>, <8 x !cir.int<s, 1>>, <8 x !cir.float> -> !cir.vector<8 x !cir.float>
+
+ // LLVM-LABEL: test_mm256_mask_loadu_ps
+ // LLVM: @llvm.masked.load.v8f32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+
+ // OGCG-LABEL: @test_mm256_mask_loadu_ps
+ // OGCG: @llvm.masked.load.v8f32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+ return _mm256_mask_loadu_ps(__W, __U, __P);
+}