[clang] [CIR][CIRGen][Builtin][X86] Masked compress Intrinsics (PR #169582)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Dec 5 03:22:52 PST 2025
https://github.com/cs25resch11005-bhuvan updated https://github.com/llvm/llvm-project/pull/169582
>From 961bd15f0f06624450d755ff53cefcb47ae9432d Mon Sep 17 00:00:00 2001
From: bhuvan1527 <balabhuvanvarma at gmail.com>
Date: Wed, 26 Nov 2025 05:11:22 +0530
Subject: [PATCH 1/2] [CIR][CIRGen][Builtin][X86] Masked compress Intrinsics
Added masked compress builtin in CIR.
Note: This is my first PR to llvm.
---
clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 75 +++-------------------
1 file changed, 9 insertions(+), 66 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 1c1ef4da20b0d..bbca169593ff7 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -88,68 +88,13 @@ static mlir::Value getMaskVecValue(CIRGenBuilderTy &builder, mlir::Location loc,
}
return maskVec;
}
+static mlir::Value emitX86CompressExpand(CIRGenFunction &cgf, const CallExpr *expr,ArrayRef<mlir::Value> ops, bool IsCompress, const std::string &ID){
+ auto ResultTy = cast<cir::VectorType>(ops[1].getType());
+ mlir::Value MaskValue = getMaskVecValue(cgf, expr, ops[2], cast<cir::VectorType>(ResultTy).getSize());
+ llvm::SmallVector<mlir::Value, 4> op{ops[0], ops[1], MaskValue};
+
+ return emitIntrinsicCallOp(cgf,expr, ID, ResultTy, op);
-// Builds the VecShuffleOp for pshuflw and pshufhw x86 builtins.
-//
-// The vector is split into lanes of 8 word elements (16 bits). The lower or
-// upper half of each lane, controlled by `isLow`, is shuffled in the following
-// way: The immediate is truncated to 8 bits, separated into 4 2-bit fields. The
-// i-th field's value represents the resulting index of the i-th element in the
-// half lane after shuffling. The other half of the lane remains unchanged.
-static cir::VecShuffleOp emitPshufWord(CIRGenBuilderTy &builder,
- const mlir::Value vec,
- const mlir::Value immediate,
- const mlir::Location loc,
- const bool isLow) {
- uint32_t imm = CIRGenFunction::getZExtIntValueFromConstOp(immediate);
-
- auto vecTy = cast<cir::VectorType>(vec.getType());
- unsigned numElts = vecTy.getSize();
-
- unsigned firstHalfStart = isLow ? 0 : 4;
- unsigned secondHalfStart = 4 - firstHalfStart;
-
- // Splat the 8-bits of immediate 4 times to help the loop wrap around.
- imm = (imm & 0xff) * 0x01010101;
-
- int64_t indices[32];
- for (unsigned l = 0; l != numElts; l += 8) {
- for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) {
- indices[l + i] = l + (imm & 3) + firstHalfStart;
- imm >>= 2;
- }
- for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i)
- indices[l + i] = l + i;
- }
-
- return builder.createVecShuffle(loc, vec, ArrayRef(indices, numElts));
-}
-
-// Builds the shuffle mask for pshufd and shufpd/shufps x86 builtins.
-// The shuffle mask is written to outIndices.
-static void
-computeFullLaneShuffleMask(CIRGenFunction &cgf, const mlir::Value vec,
- uint32_t imm, const bool isShufP,
- llvm::SmallVectorImpl<int64_t> &outIndices) {
- auto vecTy = cast<cir::VectorType>(vec.getType());
- unsigned numElts = vecTy.getSize();
- unsigned numLanes = cgf.cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
- unsigned numLaneElts = numElts / numLanes;
-
- // Splat the 8-bits of immediate 4 times to help the loop wrap around.
- imm = (imm & 0xff) * 0x01010101;
-
- for (unsigned l = 0; l != numElts; l += numLaneElts) {
- for (unsigned i = 0; i != numLaneElts; ++i) {
- uint32_t idx = imm % numLaneElts;
- imm /= numLaneElts;
- if (isShufP && i >= (numLaneElts / 2))
- idx += numElts;
- outIndices[l + i] = l + idx;
- }
- }
-
- outIndices.resize(numElts);
}
static mlir::Value emitX86MaskAddLogic(CIRGenBuilderTy &builder,
@@ -747,11 +692,9 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_compresshi512_mask:
case X86::BI__builtin_ia32_compressqi128_mask:
case X86::BI__builtin_ia32_compressqi256_mask:
- case X86::BI__builtin_ia32_compressqi512_mask:
- cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented X86 builtin call: ") +
- getContext().BuiltinInfo.getName(builtinID));
- return {};
+ case X86::BI__builtin_ia32_compressqi512_mask:{
+ return emitX86CompressExpand(*this, expr, ops, true, "x86_avx512_mask_compress");
+ }
case X86::BI__builtin_ia32_gather3div2df:
case X86::BI__builtin_ia32_gather3div2di:
case X86::BI__builtin_ia32_gather3div4df:
>From 1507fce6bd93557d8bc7482bdcddf52f59f78c92 Mon Sep 17 00:00:00 2001
From: bhuvan1527 <balabhuvanvarma at gmail.com>
Date: Thu, 27 Nov 2025 19:59:41 +0530
Subject: [PATCH 2/2] [CIR][CIRGen][Builtin][X86] Masked compress Intrinsics
This PR is related to issue #167765.
Added support for the masked compress and expand builtins in CIR CodeGen.
---
clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 94 +++++++++-
.../CodeGenBuiltins/X86/avx512vl-builtins.c | 33 ++++
.../X86/avx512vlvbmi2-builtins.c | 171 ++++++++++++++++++
3 files changed, 289 insertions(+), 9 deletions(-)
create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index bbca169593ff7..f01a90cc8df8f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -88,13 +88,79 @@ static mlir::Value getMaskVecValue(CIRGenBuilderTy &builder, mlir::Location loc,
}
return maskVec;
}
-static mlir::Value emitX86CompressExpand(CIRGenFunction &cgf, const CallExpr *expr,ArrayRef<mlir::Value> ops, bool IsCompress, const std::string &ID){
- auto ResultTy = cast<cir::VectorType>(ops[1].getType());
- mlir::Value MaskValue = getMaskVecValue(cgf, expr, ops[2], cast<cir::VectorType>(ResultTy).getSize());
- llvm::SmallVector<mlir::Value, 4> op{ops[0], ops[1], MaskValue};
-
- return emitIntrinsicCallOp(cgf,expr, ID, ResultTy, op);
+// Builds the VecShuffleOp for pshuflw and pshufhw x86 builtins.
+//
+// The vector is split into lanes of 8 word elements (16 bits). The lower or
+// upper half of each lane, controlled by `isLow`, is shuffled in the following
+// way: The immediate is truncated to 8 bits, separated into 4 2-bit fields. The
+// i-th field's value represents the resulting index of the i-th element in the
+// half lane after shuffling. The other half of the lane remains unchanged.
+static cir::VecShuffleOp emitPshufWord(CIRGenBuilderTy &builder,
+ const mlir::Value vec,
+ const mlir::Value immediate,
+ const mlir::Location loc,
+ const bool isLow) {
+ uint32_t imm = CIRGenFunction::getZExtIntValueFromConstOp(immediate);
+
+ auto vecTy = cast<cir::VectorType>(vec.getType());
+ unsigned numElts = vecTy.getSize();
+
+ unsigned firstHalfStart = isLow ? 0 : 4;
+ unsigned secondHalfStart = 4 - firstHalfStart;
+
+ // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+ imm = (imm & 0xff) * 0x01010101;
+
+ int64_t indices[32];
+ for (unsigned l = 0; l != numElts; l += 8) {
+ for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) {
+ indices[l + i] = l + (imm & 3) + firstHalfStart;
+ imm >>= 2;
+ }
+ for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i)
+ indices[l + i] = l + i;
+ }
+
+ return builder.createVecShuffle(loc, vec, ArrayRef(indices, numElts));
+}
+
+// Builds the shuffle mask for pshufd and shufpd/shufps x86 builtins.
+// The shuffle mask is written to outIndices.
+static void
+computeFullLaneShuffleMask(CIRGenFunction &cgf, const mlir::Value vec,
+ uint32_t imm, const bool isShufP,
+ llvm::SmallVectorImpl<int64_t> &outIndices) {
+ auto vecTy = cast<cir::VectorType>(vec.getType());
+ unsigned numElts = vecTy.getSize();
+ unsigned numLanes = cgf.cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
+ unsigned numLaneElts = numElts / numLanes;
+
+ // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+ imm = (imm & 0xff) * 0x01010101;
+
+ for (unsigned l = 0; l != numElts; l += numLaneElts) {
+ for (unsigned i = 0; i != numLaneElts; ++i) {
+ uint32_t idx = imm % numLaneElts;
+ imm /= numLaneElts;
+ if (isShufP && i >= (numLaneElts / 2))
+ idx += numElts;
+ outIndices[l + i] = l + idx;
+ }
+ }
+
+ outIndices.resize(numElts);
+}
+// Emits intrinsic `id`. NOTE: `mask` is really the pass-through vector (it
+// sets the result type); `inputVector` is the integer mask for getMaskVecValue.
+static mlir::Value emitX86CompressExpand(CIRGenBuilderTy &builder,
+ mlir::Location loc, mlir::Value source, mlir::Value mask,
+ mlir::Value inputVector, const std::string &id) {
+ auto resultTy = cast<cir::VectorType>(mask.getType());
+ mlir::Value maskVec =
+ getMaskVecValue(builder, loc, inputVector, resultTy.getSize());
+ return emitIntrinsicCallOp(builder, loc, id, resultTy,
+ mlir::ValueRange{source, mask, maskVec});
 }
static mlir::Value emitX86MaskAddLogic(CIRGenBuilderTy &builder,
@@ -657,6 +723,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_compressstoreqi128_mask:
case X86::BI__builtin_ia32_compressstoreqi256_mask:
case X86::BI__builtin_ia32_compressstoreqi512_mask:
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return {};
case X86::BI__builtin_ia32_expanddf128_mask:
case X86::BI__builtin_ia32_expanddf256_mask:
case X86::BI__builtin_ia32_expanddf512_mask:
@@ -674,7 +744,11 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_expandhi512_mask:
case X86::BI__builtin_ia32_expandqi128_mask:
case X86::BI__builtin_ia32_expandqi256_mask:
- case X86::BI__builtin_ia32_expandqi512_mask:
+ case X86::BI__builtin_ia32_expandqi512_mask:{
+ mlir::Location loc = getLoc(expr->getExprLoc());
+ return emitX86CompressExpand(builder, loc, ops[0], ops[1], ops[2],
+ "x86.avx512.mask.expand");
+}
case X86::BI__builtin_ia32_compressdf128_mask:
case X86::BI__builtin_ia32_compressdf256_mask:
case X86::BI__builtin_ia32_compressdf512_mask:
@@ -693,8 +767,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_compressqi128_mask:
case X86::BI__builtin_ia32_compressqi256_mask:
case X86::BI__builtin_ia32_compressqi512_mask:{
- return emitX86CompressExpand(*this, expr, ops, true, "x86_avx512_mask_compress");
- }
+ mlir::Location loc = getLoc(expr->getExprLoc());
+ return emitX86CompressExpand(builder, loc, ops[0], ops[1], ops[2],
+ "x86.avx512.mask.compress");
+}
case X86::BI__builtin_ia32_gather3div2df:
case X86::BI__builtin_ia32_gather3div2di:
case X86::BI__builtin_ia32_gather3div4df:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c
index accf1f60d7c32..57f51afcea57c 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c
@@ -199,3 +199,36 @@ __m256i test_mm256_mask_i32gather_epi32(__m256i __v1_old, __mmask8 __mask, __m25
// OGCG: @llvm.x86.avx512.mask.gather3siv8.si
return _mm256_mmask_i32gather_epi32(__v1_old, __mask, __index, __addr, 2);
}
+
+__m128d test_mm_mask_expand_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+ // CIR-LABEL: _mm_mask_expand_pd
+ // CIR: %[[MASK:.*]] = cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: %[[SHUF:.*]] = cir.vec.shuffle(%[[MASK]], %[[MASK]] : !cir.vector<8 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.int<u, 1>>
+
+ // LLVM-LABEL: test_mm_mask_expand_pd
+ // LLVM: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> <i32 0, i32 1>
+
+ // OGCG-LABEL: test_mm_mask_expand_pd
+ // OGCG: %[[BC2:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: %[[SHUF2:.*]] = shufflevector <8 x i1> %[[BC2]], <8 x i1> %[[BC2]], <2 x i32> <i32 0, i32 1>
+
+ return _mm_mask_expand_pd(__W,__U,__A);
+}
+
+__m128d test_mm_maskz_expand_pd(__mmask8 __U, __m128d __A) {
+ // CIR-LABEL: _mm_maskz_expand_pd
+ // CIR: %[[MASK:.*]] = cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: %[[SHUF:.*]] = cir.vec.shuffle(%[[MASK]], %[[MASK]] : !cir.vector<8 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.int<u, 1>>
+
+ // LLVM-LABEL: test_mm_maskz_expand_pd
+ // LLVM: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> <i32 0, i32 1>
+
+ // OGCG-LABEL: test_mm_maskz_expand_pd
+ // OGCG: %[[BC2:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: %[[SHUF2:.*]] = shufflevector <8 x i1> %[[BC2]], <8 x i1> %[[BC2]], <2 x i32> <i32 0, i32 1>
+
+ return _mm_maskz_expand_pd(__U,__A);
+}
+
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c
new file mode 100644
index 0000000000000..964971d71eb6c
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c
@@ -0,0 +1,171 @@
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+#include <immintrin.h>
+
+
+__m128i test_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) {
+ // CIR-LABEL: test_mm_mask_compress_epi16
+ // %[[MASK8:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %{{.+}}, %[[MASK8]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>) -> !cir.vector<8 x !s16i>
+ // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i>
+
+ // LLVM-LABEL: test_mm_mask_compress_epi16
+ // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+ // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]])
+ // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+ // OGCG-LABEL: test_mm_mask_compress_epi16
+ // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+ // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]])
+ // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+ return _mm_mask_compress_epi16(__S, __U, __D);
+}
+
+__m128i test_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D) {
+ // CIR-LABEL: test_mm_maskz_compress_epi16
+ // %[[MASK8:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %{{.+}}, %[[MASK8]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>) -> !cir.vector<8 x !s16i>
+ // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i>
+
+ // LLVM-LABEL: test_mm_maskz_compress_epi16
+ // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+ // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]])
+ // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+ // OGCG-LABEL: test_mm_maskz_compress_epi16
+ // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+ // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]])
+ // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+ return _mm_maskz_compress_epi16(__U, __D);
+}
+
+__m128i test_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) {
+ // CIR-LABEL: test_mm_mask_compress_epi8
+ // %[[MASK16:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+ // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %{{.+}}, %[[MASK16]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int<u, 1>>) -> !cir.vector<16 x !s8i>
+ // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i>
+
+ // LLVM-LABEL: test_mm_mask_compress_epi8
+ // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+ // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK16]])
+ // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+ // OGCG-LABEL: test_mm_mask_compress_epi8
+ // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+ // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK16]])
+ // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+ return _mm_mask_compress_epi8(__S, __U, __D);
+}
+
+__m128i test_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D) {
+ // CIR-LABEL: test_mm_maskz_compress_epi8
+ // %[[ZERO:.+]] = cir.call @_mm_setzero_si128() : () -> !cir.vector<2 x !s64i>
+ // %[[CAST1:.+]] = cir.cast bitcast %[[ZERO]] : !cir.vector<2 x !s64i> -> !cir.vector<16 x !s8i>
+ // %[[MASK16:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+ // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %[[CAST1]], %[[MASK16]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int<u, 1>>) -> !cir.vector<16 x !s8i>
+ // %[[CAST2:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i>
+
+ // LLVM-LABEL: test_mm_maskz_compress_epi8
+ // store <2 x i64> zeroinitializer, ptr %{{.+}}, align 16
+ // %[[CAST1:.+]] = bitcast <2 x i64> %{{.+}} to <16 x i8>
+ // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+ // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %[[CAST1]], <16 x i1> %[[MASK16]])
+ // %[[CAST2:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+ // OGCG-LABEL: test_mm_maskz_compress_epi8
+ // store <2 x i64> zeroinitializer, ptr %{{.+}}, align 16
+ // %[[CAST1:.+]] = bitcast <2 x i64> %{{.+}} to <16 x i8>
+ // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+ // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %[[CAST1]], <16 x i1> %[[MASK16]])
+ // %[[CAST2:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+ return _mm_maskz_compress_epi8(__U, __D);
+}
+
+__m128i test_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) {
+ // CIR-LABEL: test_mm_mask_expand_epi16
+ // %[[MASK16:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK16]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>) -> !cir.vector<8 x !s16i>
+ // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i>
+
+ // LLVM-LABEL: test_mm_mask_expand_epi16
+ // %[[MASK16:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+ // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK16]])
+ // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+ // OGCG-LABEL: test_mm_mask_expand_epi16
+ // %[[MASK16:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+ // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK16]])
+ // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+ return _mm_mask_expand_epi16(__S, __U, __D);
+}
+
+__m128i test_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D) {
+ // CIR-LABEL: test_mm_maskz_expand_epi16
+ // %[[MASK:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>) -> !cir.vector<8 x !s16i>
+ // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i>
+
+ // LLVM-LABEL: test_mm_maskz_expand_epi16
+ // %[[MASK:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+ // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK]])
+ // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+ // OGCG-LABEL: test_mm_maskz_expand_epi16
+ // %[[MASK:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+ // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK]])
+ // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+ return _mm_maskz_expand_epi16(__U, __D);
+}
+
+__m128i test_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) {
+ // CIR-LABEL: test_mm_mask_expand_epi8
+ // %[[MASK:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+ // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int<u, 1>>) -> !cir.vector<16 x !s8i>
+ // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i>
+
+ // LLVM-LABEL: test_mm_mask_expand_epi8
+ // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+ // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]])
+ // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+ // OGCG-LABEL: test_mm_mask_expand_epi8
+ // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+ // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]])
+ // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+ return _mm_mask_expand_epi8(__S, __U, __D);
+}
+
+__m128i test_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D) {
+ // CIR-LABEL: test_mm_maskz_expand_epi8
+ // %[[MASK:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+ // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int<u, 1>>) -> !cir.vector<16 x !s8i>
+ // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i>
+
+ // LLVM-LABEL: test_mm_maskz_expand_epi8
+ // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+ // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]])
+ // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+ // OGCG-LABEL: test_mm_maskz_expand_epi8
+ // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+ // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]])
+ // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+ return _mm_maskz_expand_epi8(__U, __D);
+}
More information about the cfe-commits
mailing list