[clang] [Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - allow AVX/AVX512 subvector extraction intrinsics to be used in constexpr #157712 (PR #158853)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Oct 9 18:56:51 PDT 2025
https://github.com/SeongjaeP updated https://github.com/llvm/llvm-project/pull/158853
>From 53667e100dd379001ee56eeff2da5b127fb07535 Mon Sep 17 00:00:00 2001
From: seongjaep <psjj960507 at gmail.com>
Date: Fri, 12 Sep 2025 14:18:41 +0900
Subject: [PATCH 01/21] [WIP][Clang][ConstExpr] Add initial support for AVX
256->128 extract builtins
---
clang/lib/AST/ExprConstant.cpp | 31 +++++++++++++++++++
.../test/SemaCXX/constexpr-avx-intrinsics.cpp | 25 +++++++++++++++
2 files changed, 56 insertions(+)
create mode 100644 clang/test/SemaCXX/constexpr-avx-intrinsics.cpp
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 35a866ea5010f..4674381c34018 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12027,6 +12027,37 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+
+ case X86::BI__builtin_ia32_vextracti128_si256:
+ case X86::BI__builtin_ia32_vextractf128_pd:
+ case X86::BI__builtin_ia32_vextractf128_ps:
+ case X86::BI__builtin_ia32_vextractf128_si256: {
+ APValue SourceHi, SourceLo, SourceAmt;
+ if (!EvaluateAsRValue(Info, E->getArg(0), SourceHi) ||
+ !EvaluateAsRValue(Info, E->getArg(1), SourceLo) ||
+ !EvaluateAsRValue(Info, E->getArg(2), SourceAmt))
+ return false;
+
+ QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+ unsigned SourceLen = SourceHi.getVectorLength();
+ SmallVector<APValue, 32> ResultElements;
+ ResultElements.reserve(SourceLen);
+
+ APInt Amt = SourceAmt.getInt();
+ for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) {
+ APInt Hi = SourceHi.getVectorElt(EltNum).getInt();
+ APInt Lo = SourceLo.getVectorElt(EltNum).getInt();
+ APInt R = llvm::APIntOps::fshl(Hi, Lo, Amt);
+ ResultElements.push_back(
+ APValue(APSInt(R, DestEltTy->isUnsignedIntegerOrEnumerationType())));
+ }
+
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+ }
+
+
+
+
case X86::BI__builtin_ia32_vpshldd128:
case X86::BI__builtin_ia32_vpshldd256:
case X86::BI__builtin_ia32_vpshldd512:
diff --git a/clang/test/SemaCXX/constexpr-avx-intrinsics.cpp b/clang/test/SemaCXX/constexpr-avx-intrinsics.cpp
new file mode 100644
index 0000000000000..30e1340601255
--- /dev/null
+++ b/clang/test/SemaCXX/constexpr-avx-intrinsics.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+// expected-no-diagnostics
+
+#include <immintrin.h> // AVX/AVX512 헤더
+
+// // 테스트하려는 AVX/AVX512 내장 함수를 사용하는 constexpr 함수
+// constexpr int test_avx_subvector_extraction() {
+// __m256i a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+
+// // 이슈의 핵심: 이 내장 함수 호출이 constexpr 문맥에서 가능해야 함
+// __m128i sub = _mm256_extracti128_si256(a, 0);
+
+// return _mm_cvtsi128_si32(sub); // 결과를 int로 변환하여 리턴
+// }
+
+// // 이 상수는 컴파일 시간에 평가되어야 함
+// constexpr int result = test_avx_subvector_extraction();
+
+// static_assert(result == 0, "Incorrect result");
+
+#include <immintrin.h>
+
+constexpr __m128 test(__m256 a) {
+ return _mm256_extractf128_ps(a, 1);
+}
\ No newline at end of file
>From 46458a47192c7ef899336a6a175276b644ac34f8 Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Fri, 12 Sep 2025 21:00:28 +0900
Subject: [PATCH 02/21] [clang] Support constexpr evaluation for AVX/AVX2
extract intrinsics
Implements constexpr evaluation for:
- _mm256_extracti128_si256 (AVX2, VEXTRACTI128)
- _mm256_extractf128_ps
- _mm256_extractf128_pd
- _mm256_extractf128_si256
These now work correctly in constant expressions by extracting
the appropriate 128-bit lane from a 256-bit vector.
---
clang/lib/AST/ExprConstant.cpp | 43 +++++++++++++++-------------------
1 file changed, 19 insertions(+), 24 deletions(-)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 4674381c34018..16567e56cc778 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12028,35 +12028,30 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
- case X86::BI__builtin_ia32_vextracti128_si256:
- case X86::BI__builtin_ia32_vextractf128_pd:
- case X86::BI__builtin_ia32_vextractf128_ps:
+ case X86::BI__builtin_ia32_extract128i256:
+ case X86::BI__builtin_ia32_vextractf128_pd256:
+ case X86::BI__builtin_ia32_vextractf128_ps256:
case X86::BI__builtin_ia32_vextractf128_si256: {
- APValue SourceHi, SourceLo, SourceAmt;
- if (!EvaluateAsRValue(Info, E->getArg(0), SourceHi) ||
- !EvaluateAsRValue(Info, E->getArg(1), SourceLo) ||
- !EvaluateAsRValue(Info, E->getArg(2), SourceAmt))
+ APValue SourceVec, SourceImm;
+ if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) ||
+ !EvaluateAsRValue(Info, E->getArg(1), SourceImm))
return false;
- QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
- unsigned SourceLen = SourceHi.getVectorLength();
- SmallVector<APValue, 32> ResultElements;
- ResultElements.reserve(SourceLen);
-
- APInt Amt = SourceAmt.getInt();
- for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) {
- APInt Hi = SourceHi.getVectorElt(EltNum).getInt();
- APInt Lo = SourceLo.getVectorElt(EltNum).getInt();
- APInt R = llvm::APIntOps::fshl(Hi, Lo, Amt);
- ResultElements.push_back(
- APValue(APSInt(R, DestEltTy->isUnsignedIntegerOrEnumerationType())));
- }
+ unsigned idx = SourceImm.getInt().getZExtValue() & 1;
+ const auto *RetVT = E->getType()->castAs<VectorType>();
+ unsigned RetLen = RetVT->getNumElements();
+ unsigned SrcLen = SourceVec.getVectorLength();
+ if (SrcLen != RetLen * 2)
+ return false;
+
+ SmallVector<APValue, 16> ResultElements;
+ ResultElements.reserve(RetLen);
- return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+ for (unsigned i = 0; i < RetLen; i++)
+ ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i));
+
+ return Success(APValue(ResultElements.data(), RetLen), E);
}
-
-
-
case X86::BI__builtin_ia32_vpshldd128:
case X86::BI__builtin_ia32_vpshldd256:
>From cc5b2938aeb828dbd58f84f0f5a1dc3dca2a4095 Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Tue, 16 Sep 2025 18:14:57 +0900
Subject: [PATCH 03/21] [clang] Implement constant evaluation for AVX extract
intrinsics (part)
---
clang/lib/AST/ExprConstant.cpp | 111 ++++++++++++++++++++++++++++++++-
1 file changed, 109 insertions(+), 2 deletions(-)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 16567e56cc778..9598d72416f6e 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12028,7 +12028,114 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
- case X86::BI__builtin_ia32_extract128i256:
+ case X86::BI__builtin_ia32_extracti32x4_256_mask: // _mm256_extracti32x4_epi32
+ case X86::BI__builtin_ia32_extracti32x4_mask: // _mm512_extracti32x4_epi32
+ case X86::BI__builtin_ia32_extracti32x8_mask: // _mm512_extracti32x8_epi32
+ case X86::BI__builtin_ia32_extracti64x2_256_mask: // _mm256_extracti64x2_epi64
+ case X86::BI__builtin_ia32_extracti64x2_512_mask: // _mm512_extracti64x2_epi64
+ case X86::BI__builtin_ia32_extracti64x4_mask: { // _mm512_extracti64x4_epi64
+ APValue SourceVec, SourceImm, SourceMerge, SourceKmask;
+ if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) ||
+ !EvaluateAsRValue(Info, E->getArg(1), SourceImm) ||
+ !EvaluateAsRValue(Info, E->getArg(2), SourceMerge) ||
+ !EvaluateAsRValue(Info, E->getArg(3), SourceKmask))
+ return false;
+
+ const auto *RetVT = E->getType()->castAs<VectorType>();
+ QualType EltTy = RetVT->getElementType();
+ unsigned RetLen = RetVT->getNumElements();
+
+ if (!SourceVec.isVector())
+ return false;
+ unsigned SrcLen = SourceVec.getVectorLength();
+ if (SrcLen % RetLen != 0)
+ return false;
+
+ unsigned NumLanes = SrcLen / RetLen;
+ unsigned idx = SourceImm.getInt().getZExtValue() & (NumLanes - 1);
+
+ // Step 2) Apply kmask (covers plain/mask/maskz):
+ // - plain : headers pass kmask=all-ones; merge is undef → always take Extracted.
+ // - mask : merge=dst; take? Extracted[i] : dst[i]
+ // - maskz : merge=zero; take? Extracted[i] : 0
+ uint64_t KmaskBits = SourceKmask.getInt().getZExtValue();
+
+ auto makeZeroInt = [&]() -> APValue {
+ bool Uns = EltTy->isUnsignedIntegerOrEnumerationType();
+ unsigned BW = Info.Ctx.getIntWidth(EltTy);
+ return APValue(APSInt(APInt(BW, 0), Uns));
+ };
+
+ SmallVector<APValue, 32> ResultElements;
+ ResultElements.reserve(RetLen);
+ for (unsigned i = 0; i < RetLen; i++) {
+ bool Take = (KmaskBits >> i) & 1;
+ if (Take) {
+ ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i));
+ } else {
+ // For plain (all-ones) this path is never taken.
+ // For mask : merge is the original dst element.
+ // For maskz : headers pass zero vector as merge.
+ const APValue &MergeElt =
+ SourceMerge.isVector() ? SourceMerge.getVectorElt(i) : makeZeroInt();
+ ResultElements.push_back(MergeElt);
+ }
+ }
+ return Success(APValue(ResultElements.data(), RetLen), E);
+ }
+
+ case X86::BI__builtin_ia32_extractf32x4_256_mask: // _mm256_extractf32x4_ps _mm256_mask_extractf32x4_ps _mm256_maskz_extractf32x4_ps
+ case X86::BI__builtin_ia32_extractf32x4_mask: // _mm512_extractf32x4_ps _mm512_mask_extractf32x4_ps _mm512_maskz_extractf32x4_ps
+ case X86::BI__builtin_ia32_extractf32x8_mask: // _mm512_extractf32x8_ps _mm512_mask_extractf32x8_ps _mm512_maskz_extractf32x8_ps
+
+ case X86::BI__builtin_ia32_extractf64x2_256_mask: // _mm256_extractf64x2_pd _mm256_mask_extractf64x2_pd _mm256_maskz_extractf64x2_pd
+ case X86::BI__builtin_ia32_extractf64x2_512_mask: // _mm512_extractf64x2_pd _mm512_mask_extractf64x2_pd _mm512_maskz_extractf64x2_pd
+ case X86::BI__builtin_ia32_extractf64x4_mask: { // _mm512_extractf64x4_pd _mm512_mask_extractf64x4_pd _mm512_maskz_extractf64x4_pd
+ APValue SourceVec, SourceImm, SourceMerge, SourceKmask;
+ if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) ||
+ !EvaluateAsRValue(Info, E->getArg(1), SourceImm) ||
+ !EvaluateAsRValue(Info, E->getArg(2), SourceMerge) ||
+ !EvaluateAsRValue(Info, E->getArg(3), SourceKmask))
+ return false;
+
+ const auto *RetVT = E->getType()->castAs<VectorType>();
+ QualType EltTy = RetVT->getElementType();
+ unsigned RetLen = RetVT->getNumElements();
+
+ if (!SourceVec.isVector())
+ return false;
+ unsigned SrcLen = SourceVec.getVectorLength();
+ if (SrcLen % RetLen != 0)
+ return false;
+
+ unsigned NumLanes = SrcLen / RetLen;
+ unsigned idx = SourceImm.getInt().getZExtValue() & (NumLanes - 1);
+
+ uint64_t KmaskBits = SourceKmask.getInt().getZExtValue();
+
+ auto makeZeroFP = [&]() -> APValue {
+ const llvm::fltSemantics &Sem =
+ Info.Ctx.getFloatTypeSemantics(EltTy);
+ return APValue(llvm::APFloat::getZero(Sem));
+ };
+
+ SmallVector<APValue, 32> ResultElements;
+ ResultElements.reserve(RetLen);
+ for (unsigned i = 0; i < RetLen; i++) {
+ bool Take = (KmaskBits >> i) & 1;
+ if (Take) {
+ ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i));
+ } else {
+ const APValue &MergeElt =
+ SourceMerge.isVector() ? SourceMerge.getVectorElt(i) : makeZeroInt();
+ ResultElements.push_back(MergeElt);
+ }
+ }
+ return Success(APValue(ResultElements.data(), RetLen), E);
+ }
+
+ // vector extract
+ case X86::BI__builtin_ia32_extract128i256: // avx2
case X86::BI__builtin_ia32_vextractf128_pd256:
case X86::BI__builtin_ia32_vextractf128_ps256:
case X86::BI__builtin_ia32_vextractf128_si256: {
@@ -12044,7 +12151,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
if (SrcLen != RetLen * 2)
return false;
- SmallVector<APValue, 16> ResultElements;
+ SmallVector<APValue, 32> ResultElements;
ResultElements.reserve(RetLen);
for (unsigned i = 0; i < RetLen; i++)
>From 47f4ad54385644200d9ac6ca0a522b85aa1803b0 Mon Sep 17 00:00:00 2001
From: Yuriy Chernyshov <thegeorg at yandex-team.com>
Date: Mon, 22 Sep 2025 15:58:27 +0300
Subject: [PATCH 04/21] Add missing #include <cstdlib> (#157840)
std::realloc is declared there
>From e2f3ed27890f390e9ad7fb381af9ae43f09e300c Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Tue, 23 Sep 2025 15:26:23 +0900
Subject: [PATCH 05/21] WIP: in-progress changes
---
clang/lib/AST/ExprConstant.cpp | 61 +++++++++++--------
.../test/SemaCXX/constexpr-avx-intrinsics.cpp | 25 --------
2 files changed, 34 insertions(+), 52 deletions(-)
delete mode 100644 clang/test/SemaCXX/constexpr-avx-intrinsics.cpp
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 9598d72416f6e..27728b64aa84b 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12028,6 +12028,39 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+ // vector extract
+ case X86::BI__builtin_ia32_extract128i256:
+ case X86::BI__builtin_ia32_vextractf128_pd256:
+ case X86::BI__builtin_ia32_vextractf128_ps256:
+ case X86::BI__builtin_ia32_vextractf128_si256: {
+ APValue SourceVec, SourceImm;
+ if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) ||
+ !EvaluateAsRValue(Info, E->getArg(1), SourceImm))
+ return false;
+
+ if (!SourceVec.isVector())
+ return false;
+
+ const auto *RetVT = E->getType()->castAs<VectorType>();
+ if (!RetVT) return false;
+
+ unsigned RetLen = RetVT->getNumElements();
+ unsigned SrcLen = SourceVec.getVectorLength();
+ if (SrcLen != RetLen * 2)
+ return false;
+
+ unsigned idx = SourceImm.getInt().getZExtValue() & 1;
+
+ SmallVector<APValue, 32> ResultElements;
+ ResultElements.reserve(RetLen);
+
+ for (unsigned i = 0; i < RetLen; i++)
+ ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i));
+
+ return Success(APValue(ResultElements.data(), RetLen), E);
+ }
+
+ // masked extract (ex: mm512_mask_extract32x4_epi32 / 512 -> 128)
case X86::BI__builtin_ia32_extracti32x4_256_mask: // _mm256_extracti32x4_epi32
case X86::BI__builtin_ia32_extracti32x4_mask: // _mm512_extracti32x4_epi32
case X86::BI__builtin_ia32_extracti32x8_mask: // _mm512_extracti32x8_epi32
@@ -12127,39 +12160,13 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i));
} else {
const APValue &MergeElt =
- SourceMerge.isVector() ? SourceMerge.getVectorElt(i) : makeZeroInt();
+ SourceMerge.isVector() ? SourceMerge.getVectorElt(i) : makeZeroFP();
ResultElements.push_back(MergeElt);
}
}
return Success(APValue(ResultElements.data(), RetLen), E);
}
- // vector extract
- case X86::BI__builtin_ia32_extract128i256: // avx2
- case X86::BI__builtin_ia32_vextractf128_pd256:
- case X86::BI__builtin_ia32_vextractf128_ps256:
- case X86::BI__builtin_ia32_vextractf128_si256: {
- APValue SourceVec, SourceImm;
- if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) ||
- !EvaluateAsRValue(Info, E->getArg(1), SourceImm))
- return false;
-
- unsigned idx = SourceImm.getInt().getZExtValue() & 1;
- const auto *RetVT = E->getType()->castAs<VectorType>();
- unsigned RetLen = RetVT->getNumElements();
- unsigned SrcLen = SourceVec.getVectorLength();
- if (SrcLen != RetLen * 2)
- return false;
-
- SmallVector<APValue, 32> ResultElements;
- ResultElements.reserve(RetLen);
-
- for (unsigned i = 0; i < RetLen; i++)
- ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i));
-
- return Success(APValue(ResultElements.data(), RetLen), E);
- }
-
case X86::BI__builtin_ia32_vpshldd128:
case X86::BI__builtin_ia32_vpshldd256:
case X86::BI__builtin_ia32_vpshldd512:
diff --git a/clang/test/SemaCXX/constexpr-avx-intrinsics.cpp b/clang/test/SemaCXX/constexpr-avx-intrinsics.cpp
deleted file mode 100644
index 30e1340601255..0000000000000
--- a/clang/test/SemaCXX/constexpr-avx-intrinsics.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
-// expected-no-diagnostics
-
-#include <immintrin.h> // AVX/AVX512 헤더
-
-// // 테스트하려는 AVX/AVX512 내장 함수를 사용하는 constexpr 함수
-// constexpr int test_avx_subvector_extraction() {
-// __m256i a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-
-// // 이슈의 핵심: 이 내장 함수 호출이 constexpr 문맥에서 가능해야 함
-// __m128i sub = _mm256_extracti128_si256(a, 0);
-
-// return _mm_cvtsi128_si32(sub); // 결과를 int로 변환하여 리턴
-// }
-
-// // 이 상수는 컴파일 시간에 평가되어야 함
-// constexpr int result = test_avx_subvector_extraction();
-
-// static_assert(result == 0, "Incorrect result");
-
-#include <immintrin.h>
-
-constexpr __m128 test(__m256 a) {
- return _mm256_extractf128_ps(a, 1);
-}
\ No newline at end of file
>From 16db57d65106c345c60ed963cf7a4e276b1e17ec Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Wed, 24 Sep 2025 02:09:42 +0900
Subject: [PATCH 06/21] [clang][ByteCode] constexpr-enable X86 AVX/AVX512
subvector extract builtins in InterpBuiltin
- Route AVX/AVX2 vextractf128/extract128i256 to the 2-arg extract helper.
- Route all AVX-512 (VL/DQ) extract builtins to the unified 4-arg masked helper:
* extractf32x4_{256,_}
* extractf32x8_
* extractf64x2_{256,512}
* extractf64x4_
* extracti32x4_{256,_}
* extracti32x8_
* extracti64x2_{256,512}
* extracti64x4_
- Implement mask/merge/all-ones(mask=plain)/maskz semantics.
- Initialize all elements in the destination vector.
NOTE:
Tests are not included yet. This patch wires up InterpBuiltin support only.
A follow-up patch will add constexpr tests under clang/test/AST/Interp/.
---
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 173 +++++++++++++++++++++++
1 file changed, 173 insertions(+)
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 922d67940e22f..c4040158ca440 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -712,6 +712,36 @@ static bool interp__builtin_expect(InterpState &S, CodePtr OpPC,
return true;
}
+
+/// rotateleft(value, amount) / rotateright(value, amount): rotates right when \p Right is set, left otherwise.
+static bool interp__builtin_rotate(InterpState &S, CodePtr OpPC,
+ const InterpFrame *Frame,
+ const CallExpr *Call, bool Right) {
+ APSInt Amount = popToAPSInt(S, Call->getArg(1)); // amount (arg 1) is popped before the value (arg 0)
+ APSInt Value = popToAPSInt(S, Call->getArg(0));
+
+ APSInt Result;
+ if (Right) // the rotate amount is reduced modulo the value's bit width in both branches
+ Result = APSInt(Value.rotr(Amount.urem(Value.getBitWidth())),
+ /*IsUnsigned=*/true);
+ else // Left.
+ Result = APSInt(Value.rotl(Amount.urem(Value.getBitWidth())),
+ /*IsUnsigned=*/true);
+
+ pushInteger(S, Result, Call->getType());
+ return true;
+}
+
+static bool interp__builtin_ffs(InterpState &S, CodePtr OpPC,
+ const InterpFrame *Frame,
+ const CallExpr *Call) {
+ APSInt Value = popToAPSInt(S, Call->getArg(0));
+
+ uint64_t N = Value.countr_zero(); // count of trailing zero bits
+ pushInteger(S, N == Value.getBitWidth() ? 0 : N + 1, Call->getType()); // 0 when the count equals the bit width, otherwise count + 1
+ return true;
+}
+
static bool interp__builtin_addressof(InterpState &S, CodePtr OpPC,
const InterpFrame *Frame,
const CallExpr *Call) {
@@ -2819,6 +2849,127 @@ static bool interp__builtin_elementwise_triop(
return true;
}
+/// Unmasked subvector extract: copies lane (imm % NumLanes) of the source vector into the result vector; returns false when the operands are not matching primitive arrays.
+static bool interp__builtin_x86_extract_vector(InterpState &S, CodePtr OpPC,
+ const CallExpr *Call,
+ unsigned ID) {
+ assert(Call->getNumArgs() == 2);
+
+ // Immediate (arg 1) that selects the lane to extract; popped first.
+ APSInt ImmAPS = popToAPSInt(S, Call->getArg(1));
+ uint64_t Index = ImmAPS.getZExtValue();
+
+ // Source vector; must be a primitive array.
+ const Pointer &Src = S.Stk.pop<Pointer>();
+ if (!Src.getFieldDesc()->isPrimitiveArray())
+ return false;
+
+ // Destination (return value); peeked rather than popped.
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+ if (!Dst.getFieldDesc()->isPrimitiveArray())
+ return false;
+
+ unsigned SrcElems = Src.getNumElems();
+ unsigned DstElems = Dst.getNumElems();
+
+ if (SrcElems == 0 || DstElems == 0 || (SrcElems % DstElems) != 0) // source must split into whole destination-sized lanes
+ return false;
+
+ unsigned NumLanes = SrcElems / DstElems;
+ unsigned Lane = static_cast<unsigned>(Index % NumLanes); // out-of-range immediates wrap around
+ unsigned ExtractPos = Lane * DstElems; // index of the first source element to copy
+
+ // Source and destination must share the same primitive element type.
+ PrimType ElemPT = Src.getFieldDesc()->getPrimType();
+ if (ElemPT != Dst.getFieldDesc()->getPrimType())
+ return false;
+
+ TYPE_SWITCH(ElemPT, {
+ for (unsigned I = 0; I != DstElems; ++I) {
+ Dst.elem<T>(I) = Src.elem<T>(ExtractPos + I);
+ }
+ });
+
+ Dst.initializeAllElements();
+ return true;
+}
+
+/// Masked subvector extract: result element I is the selected lane's element I when mask bit I is set, otherwise the merge vector's element I (or zero when no merge vector is available).
+static bool interp__builtin_x86_extract_vector_masked(InterpState &S, CodePtr OpPC,
+ const CallExpr *Call,
+ unsigned ID) {
+ assert(Call->getNumArgs() == 4);
+
+ // Write mask (arg 3); bit I decides whether result element I comes from the source.
+ APSInt KmaskAPS = popToAPSInt(S, Call->getArg(3));
+ uint64_t Kmask = KmaskAPS.getZExtValue();
+
+ // Merge (pass-through) vector, presumably arg 2; only used if it is a live primitive array.
+ const Pointer &Merge = S.Stk.pop<Pointer>();
+ bool HasMergeVec = Merge.isLive() && Merge.getFieldDesc() &&
+ Merge.getFieldDesc()->isPrimitiveArray();
+
+ // Immediate (arg 1) that selects the lane to extract.
+ APSInt ImmAPS = popToAPSInt(S, Call->getArg(1));
+ uint64_t Index = ImmAPS.getZExtValue();
+
+ // Source vector; must be a primitive array.
+ const Pointer &Src = S.Stk.pop<Pointer>();
+ if (!Src.getFieldDesc()->isPrimitiveArray())
+ return false;
+
+ // Destination (return value); peeked rather than popped.
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+ if (!Dst.getFieldDesc()->isPrimitiveArray())
+ return false;
+
+ unsigned SrcElems = Src.getNumElems();
+ unsigned DstElems = Dst.getNumElems();
+ if (SrcElems == 0 || DstElems == 0 || (SrcElems % DstElems) != 0) // source must split into whole destination-sized lanes
+ return false;
+
+ unsigned NumLanes = SrcElems / DstElems;
+ unsigned Lane = static_cast<unsigned>(Index % NumLanes); // out-of-range immediates wrap around
+ unsigned ExtractPos = Lane * DstElems; // index of the first source element of the lane
+
+ PrimType ElemPT = Src.getFieldDesc()->getPrimType();
+ if (ElemPT != Dst.getFieldDesc()->getPrimType())
+ return false;
+
+ // When a merge vector is present, its element type and length must match the destination.
+ if (HasMergeVec) {
+ if (Merge.getFieldDesc()->getPrimType() != ElemPT ||
+ Merge.getNumElems() != DstElems)
+ return false;
+ }
+
+ // Stores a value-initialized element at destination index I (fallback when there is no merge vector).
+ auto storeZeroAt = [&](unsigned I) {
+ TYPE_SWITCH(ElemPT, {
+ Dst.elem<T>(I) = T{};
+ });
+ };
+
+ TYPE_SWITCH(ElemPT, {
+ for (unsigned I = 0; I != DstElems; ++I) {
+ bool Take = ((Kmask >> I) & 1) != 0;
+ if (Take) {
+ Dst.elem<T>(I) = Src.elem<T>(ExtractPos + I);
+ } else {
+ if (HasMergeVec) {
+ Dst.elem<T>(I) = Merge.elem<T>(I);
+ } else {
+ storeZeroAt(I);
+ }
+ }
+ }
+ });
+
+ Dst.initializeAllElements();
+ return true;
+}
+
+
static bool interp__builtin_x86_insert_subvector(InterpState &S, CodePtr OpPC,
const CallExpr *Call,
unsigned ID) {
@@ -3452,6 +3603,28 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS);
});
+ case X86::BI__builtin_ia32_extract128i256: // _mm256_extracti128_si256
+ case X86::BI__builtin_ia32_vextractf128_pd256: // _mm256_extractf128_pd
+ case X86::BI__builtin_ia32_vextractf128_ps256: // _mm256_extractf128_ps
+ case X86::BI__builtin_ia32_vextractf128_si256: // _mm256_extractf128_si256
+ return interp__builtin_x86_extract_vector(S, OpPC, Call, BuiltinID);
+
+ // AVX-512 / AVX-512VL / AVX-512DQ
+ case X86::BI__builtin_ia32_extractf32x4_256_mask:
+ case X86::BI__builtin_ia32_extractf32x4_mask:
+ case X86::BI__builtin_ia32_extractf32x8_mask:
+ case X86::BI__builtin_ia32_extractf64x2_256_mask:
+ case X86::BI__builtin_ia32_extractf64x2_512_mask:
+ case X86::BI__builtin_ia32_extractf64x4_mask:
+ case X86::BI__builtin_ia32_extracti32x4_256_mask:
+ case X86::BI__builtin_ia32_extracti32x4_mask:
+ case X86::BI__builtin_ia32_extracti32x8_mask:
+ case X86::BI__builtin_ia32_extracti64x2_256_mask:
+ case X86::BI__builtin_ia32_extracti64x2_512_mask:
+ case X86::BI__builtin_ia32_extracti64x4_mask:
+ return interp__builtin_x86_extract_vector_masked(S, OpPC, Call, BuiltinID);
+
+
case clang::X86::BI__builtin_ia32_pavgb128:
case clang::X86::BI__builtin_ia32_pavgw128:
case clang::X86::BI__builtin_ia32_pavgb256:
>From 558e23804ab97873885dc72e61deec39280880a4 Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Wed, 24 Sep 2025 21:16:11 +0900
Subject: [PATCH 07/21] Remove commented code
---
clang/lib/AST/ExprConstant.cpp | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 27728b64aa84b..7d2b341fe9ce7 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12029,7 +12029,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
}
// vector extract
- case X86::BI__builtin_ia32_extract128i256:
+ case X86::BI__builtin_ia32_extract128i256: // avx2
case X86::BI__builtin_ia32_vextractf128_pd256:
case X86::BI__builtin_ia32_vextractf128_ps256:
case X86::BI__builtin_ia32_vextractf128_si256: {
@@ -12060,12 +12060,11 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), RetLen), E);
}
- // masked extract (ex: mm512_mask_extract32x4_epi32 / 512 -> 128)
case X86::BI__builtin_ia32_extracti32x4_256_mask: // _mm256_extracti32x4_epi32
case X86::BI__builtin_ia32_extracti32x4_mask: // _mm512_extracti32x4_epi32
case X86::BI__builtin_ia32_extracti32x8_mask: // _mm512_extracti32x8_epi32
case X86::BI__builtin_ia32_extracti64x2_256_mask: // _mm256_extracti64x2_epi64
- case X86::BI__builtin_ia32_extracti64x2_512_mask: // _mm512_extracti64x2_epi64
+ case X86::BI__builtin_ia32_extracti64x2_512_mask: // _mm512_extracti64x2_epi64
case X86::BI__builtin_ia32_extracti64x4_mask: { // _mm512_extracti64x4_epi64
APValue SourceVec, SourceImm, SourceMerge, SourceKmask;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) ||
>From 315f686d5c59599a2b8e285e7d14ad52ac84469f Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Wed, 24 Sep 2025 21:20:45 +0900
Subject: [PATCH 08/21] Add constexpr tests for AVX/AVX2/AVX-512 extract
intrinsics
---
clang/test/CodeGen/X86/avx-builtins.c | 9 +++
clang/test/CodeGen/X86/avx2-builtins.c | 2 +
clang/test/CodeGen/X86/avx512dq-builtins.c | 68 ++++++++++++++++++++
clang/test/CodeGen/X86/avx512f-builtins.c | 67 +++++++++++++++++++
clang/test/CodeGen/X86/avx512vl-builtins.c | 30 +++++++++
clang/test/CodeGen/X86/avx512vldq-builtins.c | 26 ++++++++
6 files changed, 202 insertions(+)
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 5f08b6be81ab7..5aa69c75aea28 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1071,18 +1071,27 @@ __m128d test_mm256_extractf128_pd(__m256d A) {
return _mm256_extractf128_pd(A, 1);
}
+TEST_CONSTEXPR(match_m128d(_mm256_extractf128_pd(((__m256d){0.0, 1.0, 2.0, 3.0}), 1),
+ 2.0, 3.0));
+
__m128 test_mm256_extractf128_ps(__m256 A) {
// CHECK-LABEL: test_mm256_extractf128_ps
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm256_extractf128_ps(A, 1);
}
+TEST_CONSTEXPR(match_m128(_mm256_extractf128_ps(((__m256){0,1,2,3,4,5,6,7}), 1),
+ 4.0f, 5.0f, 6.0f, 7.0f));
+
__m128i test_mm256_extractf128_si256(__m256i A) {
// CHECK-LABEL: test_mm256_extractf128_si256
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm256_extractf128_si256(A, 1);
}
+TEST_CONSTEXPR(match_m128i(_mm256_extractf128_si256(((__m256i){0,1,2,3,4,5,6,7}), 1),
+ 4, 5, 6, 7));
+
__m256d test_mm256_floor_pd(__m256d x) {
// CHECK-LABEL: test_mm256_floor_pd
// CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 1)
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 55f18f947b96f..c04d50c893c21 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -479,6 +479,8 @@ __m128i test2_mm256_extracti128_si256(__m256i a) {
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
return _mm256_extracti128_si256(a, 0);
}
+TEST_CONSTEXPR(match_m128i(_mm256_extracti128_si256(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), 0),
+ 1ULL, 2ULL));
__m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hadd_epi16
diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c
index 4112561216af8..08013705875d0 100644
--- a/clang/test/CodeGen/X86/avx512dq-builtins.c
+++ b/clang/test/CodeGen/X86/avx512dq-builtins.c
@@ -1402,6 +1402,11 @@ __m256 test_mm512_extractf32x8_ps(__m512 __A) {
// CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return _mm512_extractf32x8_ps(__A, 1);
}
+TEST_CONSTEXPR(match_m256(_mm512_extractf32x8_ps(((__m512){
+ 0.0f,1.0f,2.0f,3.0f, 4.0f,5.0f,6.0f,7.0f,
+ 8.0f,9.0f,10.0f,11.0f, 12.0f,13.0f,14.0f,15.0f
+ }), 1),
+ 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f));
__m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) {
// CHECK-LABEL: test_mm512_mask_extractf32x8_ps
@@ -1409,6 +1414,15 @@ __m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) {
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm512_mask_extractf32x8_ps(__W, __U, __A, 1);
}
+TEST_CONSTEXPR(match_m256(_mm512_mask_extractf32x8_ps(
+ (__m256){0,0,0,0,0,0,0,0}, // W
+ ((__mmask8)0xFF), // U = all ones (plain)
+ (__m512){
+ 0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,
+ 8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f
+ },
+ 1),
+ 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f));
__m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) {
// CHECK-LABEL: test_mm512_maskz_extractf32x8_ps
@@ -1416,12 +1430,24 @@ __m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) {
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm512_maskz_extractf32x8_ps(__U, __A, 1);
}
+TEST_CONSTEXPR(match_m256(_mm512_maskz_extractf32x8_ps(
+ ((__mmask8)0x0F),
+ (__m512){
+ 0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,
+ 8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f
+ },
+ 1),
+ 8.0f, 9.0f, 10.0f, 11.0f, 0.0f, 0.0f, 0.0f, 0.0f));
__m128d test_mm512_extractf64x2_pd(__m512d __A) {
// CHECK-LABEL: test_mm512_extractf64x2_pd
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <2 x i32> <i32 6, i32 7>
return _mm512_extractf64x2_pd(__A, 3);
}
+TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd(((__m512d){
+ 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0
+ }), 3),
+ 6.0, 7.0));
__m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) {
// CHECK-LABEL: test_mm512_mask_extractf64x2_pd
@@ -1429,6 +1455,12 @@ __m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A)
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm512_mask_extractf64x2_pd(__W, __U, __A, 3);
}
+TEST_CONSTEXPR(match_m128d(_mm512_mask_extractf64x2_pd(
+ (__m128d){100.0, 101.0}, // W(merge)
+ (__mmask8)0x1, // 0000 0001b
+ (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0},
+ 3),
+ 6.0, 101.0));
__m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) {
// CHECK-LABEL: test_mm512_maskz_extractf64x2_pd
@@ -1436,12 +1468,21 @@ __m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) {
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm512_maskz_extractf64x2_pd(__U, __A, 3);
}
+TEST_CONSTEXPR(match_m128d(_mm512_maskz_extractf64x2_pd(
+ (__mmask8)0x2, // 0000 0010b
+ (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0},
+ 3),
+ 0.0, 7.0));
__m256i test_mm512_extracti32x8_epi32(__m512i __A) {
// CHECK-LABEL: test_mm512_extracti32x8_epi32
// CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return _mm512_extracti32x8_epi32(__A, 1);
}
+TEST_CONSTEXPR(match_m256i(_mm512_extracti32x8_epi32(((__m512i){
+ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15
+ }), 1),
+ 8, 9,10,11,12,13,14,15));
__m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_mask_extracti32x8_epi32
@@ -1449,6 +1490,13 @@ __m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm512_mask_extracti32x8_epi32(__W, __U, __A, 1);
}
+TEST_CONSTEXPR(match_m256i(_mm512_mask_extracti32x8_epi32(
+ (__m256i){100,101,102,103,104,105,106,107}, // W(merge)
+ (__mmask8)0xAA, // 1010 1010b → take only the odd lanes
+ (__m512i){ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 },
+ 1),
+ // lane0..7: 8,9,10,11,12,13,14,15
+ 100, 9, 102, 11, 104, 13, 106, 15));
__m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_maskz_extracti32x8_epi32
@@ -1456,12 +1504,21 @@ __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) {
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm512_maskz_extracti32x8_epi32(__U, __A, 1);
}
+TEST_CONSTEXPR(match_m256i(_mm512_maskz_extracti32x8_epi32(
+ (__mmask8)0x0F,
+ (__m512i){ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 },
+ 1),
+ 8, 9, 10, 11, 0, 0, 0, 0));
__m128i test_mm512_extracti64x2_epi64(__m512i __A) {
// CHECK-LABEL: test_mm512_extracti64x2_epi64
// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <2 x i32> <i32 6, i32 7>
return _mm512_extracti64x2_epi64(__A, 3);
}
+TEST_CONSTEXPR(match_m128i_64(_mm512_extracti64x2_epi64(((__m512i){
+ 0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL
+ }), 3),
+ 6ULL, 7ULL));
__m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_mask_extracti64x2_epi64
@@ -1469,6 +1526,12 @@ __m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm512_mask_extracti64x2_epi64(__W, __U, __A, 3);
}
+TEST_CONSTEXPR(match_m128i_64(_mm512_mask_extracti64x2_epi64(
+ (__m128i){100ULL, 101ULL}, // W(merge)
+ (__mmask8)0x1, // take lane 0 only
+ (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL},
+ 3),
+ 6ULL, 101ULL));
__m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_maskz_extracti64x2_epi64
@@ -1476,6 +1539,11 @@ __m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) {
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm512_maskz_extracti64x2_epi64(__U, __A, 3);
}
+TEST_CONSTEXPR(match_m128i_64(_mm512_maskz_extracti64x2_epi64(
+ (__mmask8)0x2, // take lane 1, zero lane 0
+ (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL},
+ 3),
+ 0ULL, 7ULL));
__m512 test_mm512_insertf32x8(__m512 __A, __m256 __B) {
// CHECK-LABEL: test_mm512_insertf32x8
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index 7756f0da18c03..d37b22285174e 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -2452,6 +2452,11 @@ __m256d test_mm512_extractf64x4_pd(__m512d a)
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm512_extractf64x4_pd(a, 1);
}
+TEST_CONSTEXPR(match_m256d(_mm512_extractf64x4_pd(((__m512d){
+ 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0
+ }), 1),
+ 4.0, 5.0, 6.0, 7.0));
+
__m256d test_mm512_mask_extractf64x4_pd(__m256d __W,__mmask8 __U,__m512d __A){
// CHECK-LABEL: test_mm512_mask_extractf64x4_pd
@@ -2459,6 +2464,12 @@ __m256d test_mm512_mask_extractf64x4_pd(__m256d __W,__mmask8 __U,__m512d __A){
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm512_mask_extractf64x4_pd( __W, __U, __A, 1);
}
+TEST_CONSTEXPR(match_m256d(_mm512_mask_extractf64x4_pd(
+ (__m256d){100.0,101.0,102.0,103.0}, // W(merge)
+ (__mmask8)0x5, // 0101b
+ (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0},
+ 1),
+ 4.0, 101.0, 6.0, 103.0));
__m256d test_mm512_maskz_extractf64x4_pd(__mmask8 __U,__m512d __A){
// CHECK-LABEL: test_mm512_maskz_extractf64x4_pd
@@ -2466,6 +2477,11 @@ __m256d test_mm512_maskz_extractf64x4_pd(__mmask8 __U,__m512d __A){
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm512_maskz_extractf64x4_pd( __U, __A, 1);
}
+TEST_CONSTEXPR(match_m256d(_mm512_maskz_extractf64x4_pd(
+ (__mmask8)0x3,
+ (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0},
+ 1),
+ 4.0, 5.0, 0.0, 0.0));
__m128 test_mm512_extractf32x4_ps(__m512 a)
{
@@ -2473,6 +2489,10 @@ __m128 test_mm512_extractf32x4_ps(__m512 a)
// CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm512_extractf32x4_ps(a, 1);
}
+TEST_CONSTEXPR(match_m128(_mm512_extractf32x4_ps(((__m512){
+ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15
+ }), 1),
+ 4.0f, 5.0f, 6.0f, 7.0f));
__m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U,__m512 __A){
// CHECK-LABEL: test_mm512_mask_extractf32x4_ps
@@ -2480,6 +2500,12 @@ __m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U,__m512 __A){
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm512_mask_extractf32x4_ps( __W, __U, __A, 1);
}
+TEST_CONSTEXPR(match_m128(_mm512_mask_extractf32x4_ps(
+ (__m128){100,101,102,103}, // W(merge)
+ (__mmask8)0x5, // 0101b
+ (__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15},
+ 1),
+ 4.0f, 101.0f, 6.0f, 103.0f));
__m128 test_mm512_maskz_extractf32x4_ps( __mmask8 __U,__m512 __A){
// CHECK-LABEL: test_mm512_maskz_extractf32x4_ps
@@ -2487,6 +2513,11 @@ __m128 test_mm512_maskz_extractf32x4_ps( __mmask8 __U,__m512 __A){
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm512_maskz_extractf32x4_ps(__U, __A, 1);
}
+TEST_CONSTEXPR(match_m128(_mm512_maskz_extractf32x4_ps(
+ (__mmask8)0x3,
+ (__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15},
+ 1),
+ 4.0f, 5.0f, 0.0f, 0.0f));
__mmask16 test_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) {
// CHECK-LABEL: test_mm512_cmpeq_epu32_mask
@@ -7357,6 +7388,10 @@ __m128i test_mm512_extracti32x4_epi32(__m512i __A) {
// CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
return _mm512_extracti32x4_epi32(__A, 3);
}
+TEST_CONSTEXPR(match_m128i(_mm512_extracti32x4_epi32(((__m512i){
+ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15
+ }), 3),
+ 12, 13, 14, 15));
__m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_mask_extracti32x4_epi32
@@ -7364,6 +7399,15 @@ __m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm512_mask_extracti32x4_epi32(__W, __U, __A, 3);
}
+TEST_CONSTEXPR(match_m128i(_mm512_mask_extracti32x4_epi32(
+ (__m128i){100,101,102,103}, // merge=W
+ (__mmask8)0x5, // 0101b
+ (__m512i){
+ 0,1,2,3, 4,5,6,7,
+ 8,9,10,11, 12,13,14,15
+ },
+ 3),
+ 12, 101, 14, 103));
__m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_maskz_extracti32x4_epi32
@@ -7371,12 +7415,24 @@ __m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) {
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm512_maskz_extracti32x4_epi32(__U, __A, 3);
}
+TEST_CONSTEXPR(match_m128i(_mm512_maskz_extracti32x4_epi32(
+ (__mmask8)0x3,
+ (__m512i){
+ 0,1,2,3, 4,5,6,7,
+ 8,9,10,11, 12,13,14,15
+ },
+ 3),
+12, 13, 0, 0));
__m256i test_mm512_extracti64x4_epi64(__m512i __A) {
// CHECK-LABEL: test_mm512_extracti64x4_epi64
// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm512_extracti64x4_epi64(__A, 1);
}
+TEST_CONSTEXPR(match_m256i(_mm512_extracti64x4_epi64(((__m512i){
+ 0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL
+ }), 1),
+ 4ULL, 5ULL, 6ULL, 7ULL));
__m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_mask_extracti64x4_epi64
@@ -7384,6 +7440,12 @@ __m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __
// CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm512_mask_extracti64x4_epi64(__W, __U, __A, 1);
}
+TEST_CONSTEXPR(match_m256i_64(_mm512_mask_extracti64x4_epi64(
+ (__m256i){100ULL,101ULL,102ULL,103ULL},
+ (__mmask8)0x5,
+ (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL},
+ 1),
+ 4ULL, 101ULL, 6ULL, 103ULL));
__m256i test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_maskz_extracti64x4_epi64
@@ -7391,6 +7453,11 @@ __m256i test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) {
// CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm512_maskz_extracti64x4_epi64(__U, __A, 1);
}
+TEST_CONSTEXPR(match_m256i(_mm512_maskz_extracti64x4_epi64(
+ (__mmask8)0x3,
+ (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL},
+ 1),
+ 4ULL, 5ULL, 0ULL, 0ULL));
__m512d test_mm512_insertf64x4(__m512d __A, __m256d __B) {
// CHECK-LABEL: test_mm512_insertf64x4
diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c
index 51385d57d2944..323ac1b2cab63 100644
--- a/clang/test/CodeGen/X86/avx512vl-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vl-builtins.c
@@ -9875,6 +9875,10 @@ __m128 test_mm256_extractf32x4_ps(__m256 __A) {
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm256_extractf32x4_ps(__A, 1);
}
+TEST_CONSTEXPR(match_m128(_mm256_extractf32x4_ps(((__m256){
+ 0,1,2,3, 4,5,6,7
+ }), 1),
+ 4.0f, 5.0f, 6.0f, 7.0f));
__m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) {
// CHECK-LABEL: test_mm256_mask_extractf32x4_ps
@@ -9882,6 +9886,12 @@ __m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) {
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm256_mask_extractf32x4_ps(__W, __U, __A, 1);
}
+TEST_CONSTEXPR( match_m128(_mm256_mask_extractf32x4_ps(
+ (__m128){100,101,102,103}, // W (merge)
+ (__mmask8)0x5, // 0101b
+ (__m256){0,1,2,3, 4,5,6,7},
+ 1),
+ 4.0f, 101.0f, 6.0f, 103.0f));
__m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) {
// CHECK-LABEL: test_mm256_maskz_extractf32x4_ps
@@ -9889,12 +9899,21 @@ __m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) {
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm256_maskz_extractf32x4_ps(__U, __A, 1);
}
+TEST_CONSTEXPR(match_m128(_mm256_maskz_extractf32x4_ps(
+ (__mmask8)0x3,
+ (__m256){0,1,2,3, 4,5,6,7},
+ 1),
+ 4.0f, 5.0f, 0.0f, 0.0f));
__m128i test_mm256_extracti32x4_epi32(__m256i __A) {
// CHECK-LABEL: test_mm256_extracti32x4_epi32
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm256_extracti32x4_epi32(__A, 1);
}
+TEST_CONSTEXPR(match_m128i(_mm256_extracti32x4_epi32(((__m256i){
+ 0,1,2,3, 4,5,6,7
+ }), 1),
+ 4, 5, 6, 7));
__m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: test_mm256_mask_extracti32x4_epi32
@@ -9902,6 +9921,12 @@ __m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm256_mask_extracti32x4_epi32(__W, __U, __A, 1);
}
+TEST_CONSTEXPR(match_m128i(_mm256_mask_extracti32x4_epi32(
+ (__m128i){100,101,102,103}, // W (merge)
+ (__mmask8)0xA, // 1010b
+ (__m256i){0,1,2,3, 4,5,6,7},
+ 1),
+ 100, 5, 102, 7));
__m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: test_mm256_maskz_extracti32x4_epi32
@@ -9909,6 +9934,11 @@ __m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) {
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm256_maskz_extracti32x4_epi32(__U, __A, 1);
}
+TEST_CONSTEXPR(match_m128i(_mm256_maskz_extracti32x4_epi32(
+ (__mmask8)0x3,
+ (__m256i){0,1,2,3, 4,5,6,7},
+ 1),
+ 4, 5, 0, 0));
__m256 test_mm256_insertf32x4(__m256 __A, __m128 __B) {
// CHECK-LABEL: test_mm256_insertf32x4
diff --git a/clang/test/CodeGen/X86/avx512vldq-builtins.c b/clang/test/CodeGen/X86/avx512vldq-builtins.c
index 938845799acf5..9cfcfea3dafc7 100644
--- a/clang/test/CodeGen/X86/avx512vldq-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vldq-builtins.c
@@ -1083,6 +1083,8 @@ __m128d test_mm256_extractf64x2_pd(__m256d __A) {
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 2, i32 3>
return _mm256_extractf64x2_pd(__A, 1);
}
+TEST_CONSTEXPR(match_m128d(_mm256_extractf64x2_pd(((__m256d){0.0,1.0,2.0,3.0}), 1),
+ 2.0, 3.0));
__m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A) {
// CHECK-LABEL: test_mm256_mask_extractf64x2_pd
@@ -1090,6 +1092,12 @@ __m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A)
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm256_mask_extractf64x2_pd(__W, __U, __A, 1);
}
+TEST_CONSTEXPR(match_m128d(_mm256_mask_extractf64x2_pd(
+ (__m128d){100.0, 101.0}, // W(merge)
+ (__mmask8)0x1,
+ (__m256d){0.0,1.0,2.0,3.0},
+ 1),
+ 2.0, 101.0));
__m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) {
// CHECK-LABEL: test_mm256_maskz_extractf64x2_pd
@@ -1097,12 +1105,19 @@ __m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) {
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm256_maskz_extractf64x2_pd(__U, __A, 1);
}
+TEST_CONSTEXPR(match_m128d(_mm256_maskz_extractf64x2_pd(
+ (__mmask8)0x2,
+ (__m256d){0.0,1.0,2.0,3.0},
+ 1),
+ 0.0, 3.0));
__m128i test_mm256_extracti64x2_epi64(__m256i __A) {
// CHECK-LABEL: test_mm256_extracti64x2_epi64
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
return _mm256_extracti64x2_epi64(__A, 1);
}
+TEST_CONSTEXPR(match_m128i_64(_mm256_extracti64x2_epi64(((__m256i){0ULL,1ULL,2ULL,3ULL}), 1),
+ 2ULL, 3ULL));
__m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: test_mm256_mask_extracti64x2_epi64
@@ -1110,6 +1125,12 @@ __m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm256_mask_extracti64x2_epi64(__W, __U, __A, 1);
}
+TEST_CONSTEXPR(match_m128i_64(_mm256_mask_extracti64x2_epi64(
+ (__m128i){100ULL, 101ULL}, // W(merge)
+ (__mmask8)0x1,
+ (__m256i){0ULL,1ULL,2ULL,3ULL},
+ 1),
+ 2ULL, 101ULL));
__m128i test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: test_mm256_maskz_extracti64x2_epi64
@@ -1117,6 +1138,11 @@ __m128i test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) {
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm256_maskz_extracti64x2_epi64(__U, __A, 1);
}
+TEST_CONSTEXPR(match_m128i_64(_mm256_maskz_extracti64x2_epi64(
+ (__mmask8)0x2,
+ (__m256i){0ULL,1ULL,2ULL,3ULL},
+ 1),
+ 0ULL, 3ULL));
__m256d test_mm256_insertf64x2(__m256d __A, __m128d __B) {
// CHECK-LABEL: test_mm256_insertf64x2
>From 8a8b202795fe430b20b7249660ae6dec2cb9199c Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Fri, 26 Sep 2025 11:17:39 +0900
Subject: [PATCH 09/21] Refactoring
---
clang/include/clang/Basic/BuiltinsX86.td | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 217589d7add1d..909bb3b51b8b1 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -481,9 +481,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
def cmppd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant char)">;
def cmpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
- def vextractf128_pd256 : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int)">;
- def vextractf128_ps256 : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int)">;
- def vextractf128_si256 : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int)">;
def cvtpd2ps256 : X86Builtin<"_Vector<4, float>(_Vector<4, double>)">;
def cvtps2dq256 : X86Builtin<"_Vector<8, int>(_Vector<8, float>)">;
def cvttpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">;
@@ -504,6 +501,9 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
+ def vextractf128_pd256 : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int)">;
+ def vextractf128_ps256 : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int)">;
+ def vextractf128_si256 : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int)">;
def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
@@ -607,7 +607,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
- def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">;
}
@@ -652,6 +651,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+ def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">;
def pshuflw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">;
def pshufhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">;
@@ -1065,7 +1065,7 @@ let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256
def alignq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
}
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def extractf64x4_mask : X86Builtin<"_Vector<4, double>(_Vector<8, double>, _Constant int, _Vector<4, double>, unsigned char)">;
def extractf32x4_mask : X86Builtin<"_Vector<4, float>(_Vector<16, float>, _Constant int, _Vector<4, float>, unsigned char)">;
}
@@ -2944,24 +2944,24 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
def pmovqw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, long long int>, unsigned char)">;
}
-let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def extractf32x8_mask : X86Builtin<"_Vector<8, float>(_Vector<16, float>, _Constant int, _Vector<8, float>, unsigned char)">;
def extractf64x2_512_mask : X86Builtin<"_Vector<2, double>(_Vector<8, double>, _Constant int, _Vector<2, double>, unsigned char)">;
def extracti32x8_mask : X86Builtin<"_Vector<8, int>(_Vector<16, int>, _Constant int, _Vector<8, int>, unsigned char)">;
def extracti64x2_512_mask : X86Builtin<"_Vector<2, long long int>(_Vector<8, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">;
}
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def extracti32x4_mask : X86Builtin<"_Vector<4, int>(_Vector<16, int>, _Constant int, _Vector<4, int>, unsigned char)">;
def extracti64x4_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, long long int>, _Constant int, _Vector<4, long long int>, unsigned char)">;
}
-let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def extractf64x2_256_mask : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int, _Vector<2, double>, unsigned char)">;
def extracti64x2_256_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">;
}
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def extractf32x4_256_mask : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int, _Vector<4, float>, unsigned char)">;
def extracti32x4_256_mask : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int, _Vector<4, int>, unsigned char)">;
}
>From 016eaec9c2c9a2a84ce1b7f364060a84ecbd35be Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Fri, 26 Sep 2025 11:20:04 +0900
Subject: [PATCH 10/21] Refactoring and Test Pass
---
clang/test/CodeGen/X86/avx2-builtins.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index c04d50c893c21..de33b72995f5c 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -466,6 +466,8 @@ __m128i test0_mm256_extracti128_si256_0(__m256i a) {
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
return _mm256_extracti128_si256(a, 0);
}
+TEST_CONSTEXPR(match_m128i(_mm256_extracti128_si256(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), 0),
+ 1ULL, 2ULL));
__m128i test1_mm256_extracti128_si256_1(__m256i a) {
// CHECK-LABEL: test1_mm256_extracti128_si256
@@ -479,8 +481,6 @@ __m128i test2_mm256_extracti128_si256(__m256i a) {
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
return _mm256_extracti128_si256(a, 0);
}
-TEST_CONSTEXPR(match_m128i(_mm256_extracti128_si256(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), 0),
- 1ULL, 2ULL));
__m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hadd_epi16
>From 1b1987a02e34451bda66070d93465271f5840943 Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Fri, 26 Sep 2025 11:20:56 +0900
Subject: [PATCH 11/21] Refactoring and Test Pass
---
clang/test/CodeGen/X86/avx-builtins.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 5aa69c75aea28..7765468a9472a 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1070,7 +1070,6 @@ __m128d test_mm256_extractf128_pd(__m256d A) {
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 2, i32 3>
return _mm256_extractf128_pd(A, 1);
}
-
TEST_CONSTEXPR(match_m128d(_mm256_extractf128_pd(((__m256d){0.0, 1.0, 2.0, 3.0}), 1),
2.0, 3.0));
@@ -1079,7 +1078,6 @@ __m128 test_mm256_extractf128_ps(__m256 A) {
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm256_extractf128_ps(A, 1);
}
-
TEST_CONSTEXPR(match_m128(_mm256_extractf128_ps(((__m256){0,1,2,3,4,5,6,7}), 1),
4.0f, 5.0f, 6.0f, 7.0f));
@@ -1088,9 +1086,8 @@ __m128i test_mm256_extractf128_si256(__m256i A) {
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm256_extractf128_si256(A, 1);
}
-
-TEST_CONSTEXPR(match_m128i(_mm256_extractf128_si256(((__m256i){0,1,2,3,4,5,6,7}), 1),
- 4, 5, 6, 7));
+TEST_CONSTEXPR(match_m128i(_mm256_extractf128_si256(((__m256i){0ULL, 1ULL, 2ULL, 3ULL}), 1),
+ 2ULL, 3ULL));
__m256d test_mm256_floor_pd(__m256d x) {
// CHECK-LABEL: test_mm256_floor_pd
>From 1a7013cc5ea89f07518a95c4a653195db503e68a Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Sat, 27 Sep 2025 15:11:00 +0900
Subject: [PATCH 12/21] Refactoring and add avx512dq test
---
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 99 +++++++++++++---------
clang/lib/AST/ExprConstant.cpp | 75 +++-------------
clang/test/CodeGen/X86/avx512dq-builtins.c | 70 ++++++++-------
3 files changed, 106 insertions(+), 138 deletions(-)
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index c4040158ca440..60ba4a06bf357 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2896,71 +2896,94 @@ static bool interp__builtin_x86_extract_vector(InterpState &S, CodePtr OpPC,
// __builtin_extract_masked
static bool interp__builtin_x86_extract_vector_masked(InterpState &S, CodePtr OpPC,
- const CallExpr *Call,
- unsigned ID) {
- assert(Call->getNumArgs() == 4);
+ const CallExpr *Call,
+ unsigned ID) {
+ unsigned NumArgs = Call->getNumArgs();
- // kmask
- APSInt KmaskAPS = popToAPSInt(S, Call->getArg(3));
- uint64_t Kmask = KmaskAPS.getZExtValue();
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+ if (!Dst.getFieldDesc()->isPrimitiveArray())
+ return false;
- // merge
- const Pointer &Merge = S.Stk.pop<Pointer>();
- bool HasMergeVec = Merge.isLive() && Merge.getFieldDesc() &&
- Merge.getFieldDesc()->isPrimitiveArray();
-
- // srcimm
- APSInt ImmAPS = popToAPSInt(S, Call->getArg(1));
- uint64_t Index = ImmAPS.getZExtValue();
+ const Pointer *Merge = nullptr;
+ uint64_t Kmask = 0;
+ uint64_t Imm = 0;
+ const Pointer *Src = nullptr;
- // srcvec
- const Pointer &Src = S.Stk.pop<Pointer>();
- if (!Src.getFieldDesc()->isPrimitiveArray())
+ if (NumArgs == 4) {
+ // __m256 _mm512_mask_extractf32x8_ps(W, U, A, imm)
+ APSInt ImmAPS = popToAPSInt(S, Call->getArg(3));
+ Imm = ImmAPS.getZExtValue();
+
+ const Pointer &SrcP = S.Stk.pop<Pointer>();
+ Src = &SrcP;
+
+ APSInt KmaskAPS = popToAPSInt(S, Call->getArg(1));
+ Kmask = KmaskAPS.getZExtValue();
+
+ const Pointer &MergeP = S.Stk.pop<Pointer>();
+ Merge = &MergeP;
+
+ } else if (NumArgs == 3) {
+ // __m256 _mm512_maskz_extractf32x8_ps(U, A, imm)
+ APSInt ImmAPS = popToAPSInt(S, Call->getArg(2));
+ Imm = ImmAPS.getZExtValue();
+
+ const Pointer &SrcP = S.Stk.pop<Pointer>();
+ Src = &SrcP;
+
+ APSInt KmaskAPS = popToAPSInt(S, Call->getArg(0));
+ Kmask = KmaskAPS.getZExtValue();
+
+ Merge = nullptr; // maskz → zero fill
+ } else {
return false;
+ }
- // dst (return)
- const Pointer &Dst = S.Stk.peek<Pointer>();
- if (!Dst.getFieldDesc()->isPrimitiveArray())
+ if (!Src->getFieldDesc()->isPrimitiveArray())
return false;
- unsigned SrcElems = Src.getNumElems();
+ unsigned SrcElems = Src->getNumElems();
unsigned DstElems = Dst.getNumElems();
if (SrcElems == 0 || DstElems == 0 || (SrcElems % DstElems) != 0)
return false;
unsigned NumLanes = SrcElems / DstElems;
- unsigned Lane = static_cast<unsigned>(Index % NumLanes);
+ unsigned Lane = static_cast<unsigned>(Imm % NumLanes);
unsigned ExtractPos = Lane * DstElems;
- PrimType ElemPT = Src.getFieldDesc()->getPrimType();
+ PrimType ElemPT = Src->getFieldDesc()->getPrimType();
if (ElemPT != Dst.getFieldDesc()->getPrimType())
return false;
- // Merge vector type/len check(if)
- if (HasMergeVec) {
- if (Merge.getFieldDesc()->getPrimType() != ElemPT ||
- Merge.getNumElems() != DstElems)
- return false;
- }
+ // --- fast path added here ---
+ unsigned UsedBits = std::min<unsigned>(DstElems, 64); // limit to the mask width
+ uint64_t AllOnes = (UsedBits == 64 ? ~0ull : ((1ull << UsedBits) - 1));
+ bool MaskAll = (Kmask & AllOnes) == AllOnes;
- // generate 0 value
- auto storeZeroAt = [&](unsigned I) {
+ if (MaskAll) {
+ // ignore merge; copy directly from src
TYPE_SWITCH(ElemPT, {
- Dst.elem<T>(I) = T{};
+ for (unsigned I = 0; I != DstElems; ++I)
+ Dst.elem<T>(I) = Src->elem<T>(ExtractPos + I);
});
+ Dst.initializeAllElements();
+ return true;
+ }
+ // --- end of fast path ---
+
+ auto storeZeroAt = [&](unsigned I) {
+ TYPE_SWITCH(ElemPT, { Dst.elem<T>(I) = T{}; });
};
TYPE_SWITCH(ElemPT, {
for (unsigned I = 0; I != DstElems; ++I) {
bool Take = ((Kmask >> I) & 1) != 0;
if (Take) {
- Dst.elem<T>(I) = Src.elem<T>(ExtractPos + I);
+ Dst.elem<T>(I) = Src->elem<T>(ExtractPos + I);
+ } else if (Merge) {
+ Dst.elem<T>(I) = Merge->elem<T>(I);
} else {
- if (HasMergeVec) {
- Dst.elem<T>(I) = Merge.elem<T>(I);
- } else {
- storeZeroAt(I);
- }
+ storeZeroAt(I);
}
}
});
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 7d2b341fe9ce7..22057955d5160 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12060,12 +12060,16 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), RetLen), E);
}
- case X86::BI__builtin_ia32_extracti32x4_256_mask: // _mm256_extracti32x4_epi32
- case X86::BI__builtin_ia32_extracti32x4_mask: // _mm512_extracti32x4_epi32
- case X86::BI__builtin_ia32_extracti32x8_mask: // _mm512_extracti32x8_epi32
- case X86::BI__builtin_ia32_extracti64x2_256_mask: // _mm256_extracti64x2_epi64
- case X86::BI__builtin_ia32_extracti64x2_512_mask: // _mm512_extracti64x2_epi64
- case X86::BI__builtin_ia32_extracti64x4_mask: { // _mm512_extracti64x4_epi64
+ case X86::BI__builtin_ia32_extracti32x4_256_mask:
+ case X86::BI__builtin_ia32_extractf32x4_256_mask:
+ case X86::BI__builtin_ia32_extracti32x4_mask:
+ case X86::BI__builtin_ia32_extractf32x4_mask:
+ case X86::BI__builtin_ia32_extracti32x8_mask:
+ case X86::BI__builtin_ia32_extractf32x8_mask:
+ case X86::BI__builtin_ia32_extracti64x2_256_mask:
+ case X86::BI__builtin_ia32_extractf64x2_256_mask:
+ case X86::BI__builtin_ia32_extracti64x2_512_mask:
+ case X86::BI__builtin_ia32_extractf64x2_512_mask: {
APValue SourceVec, SourceImm, SourceMerge, SourceKmask;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceImm) ||
@@ -12085,11 +12089,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
unsigned NumLanes = SrcLen / RetLen;
unsigned idx = SourceImm.getInt().getZExtValue() & (NumLanes - 1);
-
- // Step 2) Apply kmask (covers plain/mask/maskz):
- // - plain : headers pass kmask=all-ones; merge is undef → always take Extracted.
- // - mask : merge=dst; take? Extracted[i] : dst[i]
- // - maskz : merge=zero; take? Extracted[i] : 0
+
uint64_t KmaskBits = SourceKmask.getInt().getZExtValue();
auto makeZeroInt = [&]() -> APValue {
@@ -12105,9 +12105,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
if (Take) {
ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i));
} else {
- // For plain (all-ones) this path is never taken.
- // For mask : merge is the original dst element.
- // For maskz : headers pass zero vector as merge.
+
const APValue &MergeElt =
SourceMerge.isVector() ? SourceMerge.getVectorElt(i) : makeZeroInt();
ResultElements.push_back(MergeElt);
@@ -12116,55 +12114,6 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), RetLen), E);
}
- case X86::BI__builtin_ia32_extractf32x4_256_mask: // _mm256_extractf32x4_ps _mm256_mask_extractf32x4_ps _mm256_maskz_extractf32x4_ps
- case X86::BI__builtin_ia32_extractf32x4_mask: // _mm512_extractf32x4_ps _mm512_mask_extractf32x4_ps _mm512_maskz_extractf32x4_ps
- case X86::BI__builtin_ia32_extractf32x8_mask: // _mm512_extractf32x8_ps _mm512_mask_extractf32x8_ps _mm512_maskz_extractf32x8_ps
-
- case X86::BI__builtin_ia32_extractf64x2_256_mask: // _mm256_extractf64x2_pd _mm256_mask_extractf64x2_pd _mm256_maskz_extractf64x2_pd
- case X86::BI__builtin_ia32_extractf64x2_512_mask: // _mm512_extractf64x2_pd _mm512_mask_extractf64x2_pd _mm512_maskz_extractf64x2_pd
- case X86::BI__builtin_ia32_extractf64x4_mask: { // _mm512_extractf64x4_pd _mm512_mask_extractf64x4_pd _mm512_maskz_extractf64x4_pd
- APValue SourceVec, SourceImm, SourceMerge, SourceKmask;
- if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) ||
- !EvaluateAsRValue(Info, E->getArg(1), SourceImm) ||
- !EvaluateAsRValue(Info, E->getArg(2), SourceMerge) ||
- !EvaluateAsRValue(Info, E->getArg(3), SourceKmask))
- return false;
-
- const auto *RetVT = E->getType()->castAs<VectorType>();
- QualType EltTy = RetVT->getElementType();
- unsigned RetLen = RetVT->getNumElements();
-
- if (!SourceVec.isVector())
- return false;
- unsigned SrcLen = SourceVec.getVectorLength();
- if (SrcLen % RetLen != 0)
- return false;
-
- unsigned NumLanes = SrcLen / RetLen;
- unsigned idx = SourceImm.getInt().getZExtValue() & (NumLanes - 1);
-
- uint64_t KmaskBits = SourceKmask.getInt().getZExtValue();
-
- auto makeZeroFP = [&]() -> APValue {
- const llvm::fltSemantics &Sem =
- Info.Ctx.getFloatTypeSemantics(EltTy);
- return APValue(llvm::APFloat::getZero(Sem));
- };
-
- SmallVector<APValue, 32> ResultElements;
- ResultElements.reserve(RetLen);
- for (unsigned i = 0; i < RetLen; i++) {
- bool Take = (KmaskBits >> i) & 1;
- if (Take) {
- ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i));
- } else {
- const APValue &MergeElt =
- SourceMerge.isVector() ? SourceMerge.getVectorElt(i) : makeZeroFP();
- ResultElements.push_back(MergeElt);
- }
- }
- return Success(APValue(ResultElements.data(), RetLen), E);
- }
case X86::BI__builtin_ia32_vpshldd128:
case X86::BI__builtin_ia32_vpshldd256:
diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c
index 08013705875d0..5a61040db9ef3 100644
--- a/clang/test/CodeGen/X86/avx512dq-builtins.c
+++ b/clang/test/CodeGen/X86/avx512dq-builtins.c
@@ -1415,12 +1415,12 @@ __m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) {
return _mm512_mask_extractf32x8_ps(__W, __U, __A, 1);
}
TEST_CONSTEXPR(match_m256(_mm512_mask_extractf32x8_ps(
- (__m256){0,0,0,0,0,0,0,0}, // W
- ((__mmask8)0xFF), // U = all ones (plain)
- (__m512){
+ ((__m256)(__v8sf){0,0,0,0,0,0,0,0}), // W
+ (__mmask8)0xFF,
+ ((__m512)(__v16sf){
0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,
8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f
- },
+ }),
1),
8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f));
@@ -1431,11 +1431,11 @@ __m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) {
return _mm512_maskz_extractf32x8_ps(__U, __A, 1);
}
TEST_CONSTEXPR(match_m256(_mm512_maskz_extractf32x8_ps(
- ((__mmask8)0x0F),
- (__m512){
+ (__mmask8)0x0F,
+ ((__m512)(__v16sf){
0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,
8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f
- },
+ }),
1),
8.0f, 9.0f, 10.0f, 11.0f, 0.0f, 0.0f, 0.0f, 0.0f));
@@ -1444,9 +1444,8 @@ __m128d test_mm512_extractf64x2_pd(__m512d __A) {
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <2 x i32> <i32 6, i32 7>
return _mm512_extractf64x2_pd(__A, 3);
}
-TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd(((__m512d){
- 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0
- }), 3),
+TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd(
+ ((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}), 3),
6.0, 7.0));
__m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) {
@@ -1456,9 +1455,9 @@ __m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A)
return _mm512_mask_extractf64x2_pd(__W, __U, __A, 3);
}
TEST_CONSTEXPR(match_m128d(_mm512_mask_extractf64x2_pd(
- (__m128d){100.0, 101.0}, // W(merge)
- (__mmask8)0x1, // 0000 0001b
- (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0},
+ ((__m128d)(__v2df){100.0, 101.0}), // W(merge)
+ (__mmask8)0x1,
+ ((__m512d)(__v8df){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}),
3),
6.0, 101.0));
@@ -1469,8 +1468,8 @@ __m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) {
return _mm512_maskz_extractf64x2_pd(__U, __A, 3);
}
TEST_CONSTEXPR(match_m128d(_mm512_maskz_extractf64x2_pd(
- (__mmask8)0x2, // 0000 0010b
- (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0},
+ (__mmask8)0x2,
+ ((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}),
3),
0.0, 7.0));
@@ -1479,9 +1478,8 @@ __m256i test_mm512_extracti32x8_epi32(__m512i __A) {
// CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return _mm512_extracti32x8_epi32(__A, 1);
}
-TEST_CONSTEXPR(match_m256i(_mm512_extracti32x8_epi32(((__m512i){
- 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15
- }), 1),
+TEST_CONSTEXPR(match_v8si(_mm512_extracti32x8_epi32(
+ ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1),
8, 9,10,11,12,13,14,15));
__m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) {
@@ -1490,12 +1488,11 @@ __m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm512_mask_extracti32x8_epi32(__W, __U, __A, 1);
}
-TEST_CONSTEXPR(match_m256i(_mm512_mask_extracti32x8_epi32(
- (__m256i){100,101,102,103,104,105,106,107}, // W(merge)
- (__mmask8)0xAA, // 1010 1010b → only odd lanetake
- (__m512i){ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 },
+TEST_CONSTEXPR(match_v8si(_mm512_mask_extracti32x8_epi32(
+ ((__m256i)(__v8si){100,101,102,103,104,105,106,107}),
+ (__mmask8)0xAA,
+ ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),
1),
- // lane0..7: 8,9,10,11,12,13,14,15
100, 9, 102, 11, 104, 13, 106, 15));
__m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) {
@@ -1504,9 +1501,9 @@ __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) {
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm512_maskz_extracti32x8_epi32(__U, __A, 1);
}
-TEST_CONSTEXPR(match_m256i(_mm512_maskz_extracti32x8_epi32(
- (__mmask8)0x0F,
- (__m512i){ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 },
+TEST_CONSTEXPR(match_v8si(_mm512_maskz_extracti32x8_epi32(
+ (__mmask8)0x0F,
+ ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),
1),
8, 9, 10, 11, 0, 0, 0, 0));
@@ -1515,9 +1512,8 @@ __m128i test_mm512_extracti64x2_epi64(__m512i __A) {
// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <2 x i32> <i32 6, i32 7>
return _mm512_extracti64x2_epi64(__A, 3);
}
-TEST_CONSTEXPR(match_m128i_64(_mm512_extracti64x2_epi64(((__m512i){
- 0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL
- }), 3),
+TEST_CONSTEXPR(match_m128i(_mm512_extracti64x2_epi64(
+ ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3),
6ULL, 7ULL));
__m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) {
@@ -1526,10 +1522,10 @@ __m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm512_mask_extracti64x2_epi64(__W, __U, __A, 3);
}
-TEST_CONSTEXPR(match_m128i_64(_mm512_mask_extracti64x2_epi64(
- (__m128i){100ULL, 101ULL}, // W(merge)
- (__mmask8)0x1, // lane0만 take
- (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL},
+TEST_CONSTEXPR(match_m128i(_mm512_mask_extracti64x2_epi64(
+ ((__m128i)(__v2di){100ULL, 101ULL}),
+ (__mmask8)0x1,
+ ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}),
3),
6ULL, 101ULL));
@@ -1539,11 +1535,11 @@ __m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) {
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm512_maskz_extracti64x2_epi64(__U, __A, 3);
}
-TEST_CONSTEXPR(match_m128i_64(_mm512_maskz_extracti64x2_epi64(
- (__mmask8)0x2, // lane1 take, lane0 0
- (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL},
+TEST_CONSTEXPR(match_m128i(_mm512_maskz_extracti64x2_epi64(
+ (__mmask8)0x2,
+ ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}),
3),
- 0ULL, 7ULL));
+ 0ULL, 7ULL))
__m512 test_mm512_insertf32x8(__m512 __A, __m256 __B) {
// CHECK-LABEL: test_mm512_insertf32x8
>From b002c17acde0d58809abeaa9bb0e25b1a2b928d5 Mon Sep 17 00:00:00 2001
From: seongjaep <psjj960507 at gmail.com>
Date: Sun, 28 Sep 2025 18:53:45 +0900
Subject: [PATCH 13/21] no mask version test
---
clang/test/CodeGen/X86/avx512dq-builtins.c | 40 +++++++++++-----------
1 file changed, 20 insertions(+), 20 deletions(-)
diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c
index 5a61040db9ef3..e9f344b240329 100644
--- a/clang/test/CodeGen/X86/avx512dq-builtins.c
+++ b/clang/test/CodeGen/X86/avx512dq-builtins.c
@@ -1402,11 +1402,11 @@ __m256 test_mm512_extractf32x8_ps(__m512 __A) {
// CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return _mm512_extractf32x8_ps(__A, 1);
}
-TEST_CONSTEXPR(match_m256(_mm512_extractf32x8_ps(((__m512){
- 0.0f,1.0f,2.0f,3.0f, 4.0f,5.0f,6.0f,7.0f,
- 8.0f,9.0f,10.0f,11.0f, 12.0f,13.0f,14.0f,15.0f
- }), 1),
- 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f));
+// TEST_CONSTEXPR(match_m256(_mm512_extractf32x8_ps(((__m512){
+// 0.0f,1.0f,2.0f,3.0f, 4.0f,5.0f,6.0f,7.0f,
+// 8.0f,9.0f,10.0f,11.0f, 12.0f,13.0f,14.0f,15.0f
+// }), 1),
+// 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f));
__m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) {
// CHECK-LABEL: test_mm512_mask_extractf32x8_ps
@@ -1444,9 +1444,9 @@ __m128d test_mm512_extractf64x2_pd(__m512d __A) {
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <2 x i32> <i32 6, i32 7>
return _mm512_extractf64x2_pd(__A, 3);
}
-TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd(
- ((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}), 3),
- 6.0, 7.0));
+// TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd(
+// ((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}), 3),
+// 6.0, 7.0));
__m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) {
// CHECK-LABEL: test_mm512_mask_extractf64x2_pd
@@ -1478,9 +1478,9 @@ __m256i test_mm512_extracti32x8_epi32(__m512i __A) {
// CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return _mm512_extracti32x8_epi32(__A, 1);
}
-TEST_CONSTEXPR(match_v8si(_mm512_extracti32x8_epi32(
- ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1),
- 8, 9,10,11,12,13,14,15));
+// TEST_CONSTEXPR(match_v8si(_mm512_extracti32x8_epi32(
+// ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1),
+// 8, 9,10,11,12,13,14,15));
__m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_mask_extracti32x8_epi32
@@ -1501,20 +1501,20 @@ __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) {
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm512_maskz_extracti32x8_epi32(__U, __A, 1);
}
-TEST_CONSTEXPR(match_v8si(_mm512_maskz_extracti32x8_epi32(
- (__mmask8)0x0F,
- ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),
- 1),
- 8, 9, 10, 11, 0, 0, 0, 0));
+// TEST_CONSTEXPR(match_v8si(_mm512_maskz_extracti32x8_epi32(
+// (__mmask8)0x0F,
+// ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),
+// 1),
+// 8, 9, 10, 11, 0, 0, 0, 0));
__m128i test_mm512_extracti64x2_epi64(__m512i __A) {
// CHECK-LABEL: test_mm512_extracti64x2_epi64
// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <2 x i32> <i32 6, i32 7>
return _mm512_extracti64x2_epi64(__A, 3);
}
-TEST_CONSTEXPR(match_m128i(_mm512_extracti64x2_epi64(
- ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3),
- 6ULL, 7ULL));
+// TEST_CONSTEXPR(match_m128i(_mm512_extracti64x2_epi64(
+// ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3),
+// 6ULL, 7ULL));
__m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_mask_extracti64x2_epi64
@@ -1539,7 +1539,7 @@ TEST_CONSTEXPR(match_m128i(_mm512_maskz_extracti64x2_epi64(
(__mmask8)0x2,
((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}),
3),
- 0ULL, 7ULL))
+ 0ULL, 7ULL));
__m512 test_mm512_insertf32x8(__m512 __A, __m256 __B) {
// CHECK-LABEL: test_mm512_insertf32x8
>From de74751c42efb89af7a8c0c60f93f008fedb0f30 Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Mon, 29 Sep 2025 15:54:18 +0900
Subject: [PATCH 14/21] fix for test undefined -> setzero
---
clang/lib/Headers/avx512dqintrin.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h
index fb65bf933b8ad..0ff776b36436e 100644
--- a/clang/lib/Headers/avx512dqintrin.h
+++ b/clang/lib/Headers/avx512dqintrin.h
@@ -1214,7 +1214,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
#define _mm512_extractf32x8_ps(A, imm) \
((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v8sf)_mm256_undefined_ps(), \
+ (__v8sf)_mm256_setzero_ps(), \
(__mmask8)-1))
#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \
@@ -1230,7 +1230,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
#define _mm512_extractf64x2_pd(A, imm) \
((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
(int)(imm), \
- (__v2df)_mm_undefined_pd(), \
+ (__v2df)_mm_setzero_pd(), \
(__mmask8)-1))
#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \
@@ -1247,7 +1247,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
#define _mm512_extracti32x8_epi32(A, imm) \
((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v8si)_mm256_undefined_si256(), \
+ (__v8si)_mm256_setzero_si256(), \
(__mmask8)-1))
#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \
@@ -1263,7 +1263,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
#define _mm512_extracti64x2_epi64(A, imm) \
((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
(int)(imm), \
- (__v2di)_mm_undefined_si128(), \
+ (__v2di)_mm_setzero_si128(), \
(__mmask8)-1))
#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \
>From 5e3c103944ddae98c62f4fa80f1118e272cf535b Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Mon, 29 Sep 2025 23:35:06 +0900
Subject: [PATCH 15/21] refactoring
---
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 99 ++++++----------------
clang/lib/AST/ExprConstant.cpp | 60 ++++++-------
clang/test/CodeGen/X86/avx512dq-builtins.c | 38 ++++-----
3 files changed, 68 insertions(+), 129 deletions(-)
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 60ba4a06bf357..05ef09b3cbaee 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2894,97 +2894,46 @@ static bool interp__builtin_x86_extract_vector(InterpState &S, CodePtr OpPC,
return true;
}
-// __builtin_extract_masked
static bool interp__builtin_x86_extract_vector_masked(InterpState &S, CodePtr OpPC,
const CallExpr *Call,
unsigned ID) {
- unsigned NumArgs = Call->getNumArgs();
-
- const Pointer &Dst = S.Stk.peek<Pointer>();
- if (!Dst.getFieldDesc()->isPrimitiveArray())
- return false;
-
- const Pointer *Merge = nullptr;
- uint64_t Kmask = 0;
- uint64_t Imm = 0;
- const Pointer *Src = nullptr;
-
- if (NumArgs == 4) {
- // __m256 _mm512_mask_extractf32x8_ps(W, U, A, imm)
- APSInt ImmAPS = popToAPSInt(S, Call->getArg(3));
- Imm = ImmAPS.getZExtValue();
-
- const Pointer &SrcP = S.Stk.pop<Pointer>();
- Src = &SrcP;
-
- APSInt KmaskAPS = popToAPSInt(S, Call->getArg(1));
- Kmask = KmaskAPS.getZExtValue();
+ assert(Call->getNumArgs() == 4);
- const Pointer &MergeP = S.Stk.pop<Pointer>();
- Merge = &MergeP;
-
- } else if (NumArgs == 3) {
- // __m256 _mm512_maskz_extractf32x8_ps(U, A, imm)
- APSInt ImmAPS = popToAPSInt(S, Call->getArg(2));
- Imm = ImmAPS.getZExtValue();
-
- const Pointer &SrcP = S.Stk.pop<Pointer>();
- Src = &SrcP;
-
- APSInt KmaskAPS = popToAPSInt(S, Call->getArg(0));
- Kmask = KmaskAPS.getZExtValue();
+ APSInt UAPS = popToAPSInt(S, Call->getArg(3));
+ const Pointer &W = S.Stk.pop<Pointer>();
+ APSInt ImmAPS = popToAPSInt(S, Call->getArg(1));
+ const Pointer &A = S.Stk.pop<Pointer>();
- Merge = nullptr; // maskz → zero fill
- } else {
+ if (!A.getFieldDesc()->isPrimitiveArray() || !W.getFieldDesc()->isPrimitiveArray())
return false;
- }
- if (!Src->getFieldDesc()->isPrimitiveArray())
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+ if (!Dst.getFieldDesc()->isPrimitiveArray())
return false;
- unsigned SrcElems = Src->getNumElems();
+ unsigned SrcElems = A.getNumElems();
unsigned DstElems = Dst.getNumElems();
- if (SrcElems == 0 || DstElems == 0 || (SrcElems % DstElems) != 0)
+ if (!SrcElems || !DstElems || (SrcElems % DstElems) != 0)
return false;
- unsigned NumLanes = SrcElems / DstElems;
- unsigned Lane = static_cast<unsigned>(Imm % NumLanes);
- unsigned ExtractPos = Lane * DstElems;
-
- PrimType ElemPT = Src->getFieldDesc()->getPrimType();
- if (ElemPT != Dst.getFieldDesc()->getPrimType())
+ // 타입 일치 체크
+ PrimType PT = A.getFieldDesc()->getPrimType();
+ if (PT != Dst.getFieldDesc()->getPrimType() ||
+ PT != W.getFieldDesc()->getPrimType())
return false;
- // --- 여기서 fast-path 추가 ---
- unsigned UsedBits = std::min<unsigned>(DstElems, 64); // mask 폭 제한
- uint64_t AllOnes = (UsedBits == 64 ? ~0ull : ((1ull << UsedBits) - 1));
- bool MaskAll = (Kmask & AllOnes) == AllOnes;
+ unsigned numLanes = SrcElems / DstElems;
+ unsigned lane = static_cast<unsigned>(ImmAPS.getZExtValue() % numLanes);
+ unsigned base = lane * DstElems;
- if (MaskAll) {
- // merge는 무시, src에서 그대로 복사
- TYPE_SWITCH(ElemPT, {
- for (unsigned I = 0; I != DstElems; ++I)
- Dst.elem<T>(I) = Src->elem<T>(ExtractPos + I);
- });
- Dst.initializeAllElements();
- return true;
- }
- // --- fast-path 끝 ---
-
- auto storeZeroAt = [&](unsigned I) {
- TYPE_SWITCH(ElemPT, { Dst.elem<T>(I) = T{}; });
- };
+ uint64_t U = UAPS.getZExtValue();
- TYPE_SWITCH(ElemPT, {
- for (unsigned I = 0; I != DstElems; ++I) {
- bool Take = ((Kmask >> I) & 1) != 0;
- if (Take) {
- Dst.elem<T>(I) = Src->elem<T>(ExtractPos + I);
- } else if (Merge) {
- Dst.elem<T>(I) = Merge->elem<T>(I);
- } else {
- storeZeroAt(I);
- }
+ TYPE_SWITCH(PT, {
+ for (unsigned i = 0; i < DstElems; ++i) {
+ if ((U >> i) & 1)
+ Dst.elem<T>(i) = A.elem<T>(base + i);
+ else
+ Dst.elem<T>(i) = W.elem<T>(i);
}
});
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 22057955d5160..327265b79d101 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12069,49 +12069,39 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case X86::BI__builtin_ia32_extracti64x2_256_mask:
case X86::BI__builtin_ia32_extractf64x2_256_mask:
case X86::BI__builtin_ia32_extracti64x2_512_mask:
- case X86::BI__builtin_ia32_extractf64x2_512_mask: {
- APValue SourceVec, SourceImm, SourceMerge, SourceKmask;
- if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) ||
- !EvaluateAsRValue(Info, E->getArg(1), SourceImm) ||
- !EvaluateAsRValue(Info, E->getArg(2), SourceMerge) ||
- !EvaluateAsRValue(Info, E->getArg(3), SourceKmask))
- return false;
+ case X86::BI__builtin_ia32_extractf64x2_512_mask:
+ case X86::BI__builtin_ia32_extractf64x4_mask:{
+ APValue A, W;
+ APSInt Imm, U;
+
+ if (!EvaluateAsRValue(Info, E->getArg(0), A) || // A
+ !EvaluateInteger(E->getArg(1), Imm, Info) || // imm
+ !EvaluateAsRValue(Info, E->getArg(2), W) || // W (merge)
+ !EvaluateInteger(E->getArg(3), U, Info)) // U (mask)
+ return false;
const auto *RetVT = E->getType()->castAs<VectorType>();
- QualType EltTy = RetVT->getElementType();
+ // QualType EltTy = RetVT->getElementType();
unsigned RetLen = RetVT->getNumElements();
- if (!SourceVec.isVector())
- return false;
- unsigned SrcLen = SourceVec.getVectorLength();
- if (SrcLen % RetLen != 0)
- return false;
-
- unsigned NumLanes = SrcLen / RetLen;
- unsigned idx = SourceImm.getInt().getZExtValue() & (NumLanes - 1);
-
- uint64_t KmaskBits = SourceKmask.getInt().getZExtValue();
-
- auto makeZeroInt = [&]() -> APValue {
- bool Uns = EltTy->isUnsignedIntegerOrEnumerationType();
- unsigned BW = Info.Ctx.getIntWidth(EltTy);
- return APValue(APSInt(APInt(BW, 0), Uns));
- };
+ if (!A.isVector() || !W.isVector()) return false;
+ unsigned SrcLen = A.getVectorLength();
+ if (!SrcLen || !RetLen || (SrcLen % RetLen) != 0) return false;
+
+ unsigned lanes = SrcLen / RetLen;
+ unsigned lane = static_cast<unsigned>(Imm.getZExtValue() % lanes);
+ unsigned base = lane * RetLen;
+ uint64_t K = U.getZExtValue();
SmallVector<APValue, 32> ResultElements;
ResultElements.reserve(RetLen);
- for (unsigned i = 0; i < RetLen; i++) {
- bool Take = (KmaskBits >> i) & 1;
- if (Take) {
- ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i));
- } else {
-
- const APValue &MergeElt =
- SourceMerge.isVector() ? SourceMerge.getVectorElt(i) : makeZeroInt();
- ResultElements.push_back(MergeElt);
- }
+ for (unsigned i = 0; i < RetLen; ++i) {
+ if ((K >> i) & 1)
+ ResultElements.push_back(A.getVectorElt(base + i));
+ else
+ ResultElements.push_back(W.getVectorElt(i)); // maskz/unmasked 모두 헤더에서 맞춰줌
}
- return Success(APValue(ResultElements.data(), RetLen), E);
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c
index e9f344b240329..f6ff1828cb41d 100644
--- a/clang/test/CodeGen/X86/avx512dq-builtins.c
+++ b/clang/test/CodeGen/X86/avx512dq-builtins.c
@@ -1402,11 +1402,11 @@ __m256 test_mm512_extractf32x8_ps(__m512 __A) {
// CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return _mm512_extractf32x8_ps(__A, 1);
}
-// TEST_CONSTEXPR(match_m256(_mm512_extractf32x8_ps(((__m512){
-// 0.0f,1.0f,2.0f,3.0f, 4.0f,5.0f,6.0f,7.0f,
-// 8.0f,9.0f,10.0f,11.0f, 12.0f,13.0f,14.0f,15.0f
-// }), 1),
-// 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f));
+TEST_CONSTEXPR(match_m256(_mm512_extractf32x8_ps(((__m512){
+ 0.0f,1.0f,2.0f,3.0f, 4.0f,5.0f,6.0f,7.0f,
+ 8.0f,9.0f,10.0f,11.0f, 12.0f,13.0f,14.0f,15.0f
+ }), 1),
+ 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f));
__m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) {
// CHECK-LABEL: test_mm512_mask_extractf32x8_ps
@@ -1444,9 +1444,9 @@ __m128d test_mm512_extractf64x2_pd(__m512d __A) {
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <2 x i32> <i32 6, i32 7>
return _mm512_extractf64x2_pd(__A, 3);
}
-// TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd(
-// ((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}), 3),
-// 6.0, 7.0));
+TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd(
+ ((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}), 3),
+ 6.0, 7.0));
__m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) {
// CHECK-LABEL: test_mm512_mask_extractf64x2_pd
@@ -1478,9 +1478,9 @@ __m256i test_mm512_extracti32x8_epi32(__m512i __A) {
// CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return _mm512_extracti32x8_epi32(__A, 1);
}
-// TEST_CONSTEXPR(match_v8si(_mm512_extracti32x8_epi32(
-// ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1),
-// 8, 9,10,11,12,13,14,15));
+TEST_CONSTEXPR(match_v8si(_mm512_extracti32x8_epi32(
+ ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1),
+ 8, 9,10,11,12,13,14,15));
__m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_mask_extracti32x8_epi32
@@ -1501,20 +1501,20 @@ __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) {
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm512_maskz_extracti32x8_epi32(__U, __A, 1);
}
-// TEST_CONSTEXPR(match_v8si(_mm512_maskz_extracti32x8_epi32(
-// (__mmask8)0x0F,
-// ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),
-// 1),
-// 8, 9, 10, 11, 0, 0, 0, 0));
+TEST_CONSTEXPR(match_v8si(_mm512_maskz_extracti32x8_epi32(
+ (__mmask8)0x0F,
+ ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),
+ 1),
+ 8, 9, 10, 11, 0, 0, 0, 0));
__m128i test_mm512_extracti64x2_epi64(__m512i __A) {
// CHECK-LABEL: test_mm512_extracti64x2_epi64
// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <2 x i32> <i32 6, i32 7>
return _mm512_extracti64x2_epi64(__A, 3);
}
-// TEST_CONSTEXPR(match_m128i(_mm512_extracti64x2_epi64(
-// ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3),
-// 6ULL, 7ULL));
+TEST_CONSTEXPR(match_m128i(_mm512_extracti64x2_epi64(
+ ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3),
+ 6ULL, 7ULL));
__m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_mask_extracti64x2_epi64
>From 05d6d8b72f982d16e10e93bcbedfd627471385e2 Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Mon, 29 Sep 2025 23:51:58 +0900
Subject: [PATCH 16/21] Add _extracti64x4_mask
---
clang/lib/AST/ExprConstant.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 327265b79d101..ae59da87a3ccd 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12070,6 +12070,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case X86::BI__builtin_ia32_extractf64x2_256_mask:
case X86::BI__builtin_ia32_extracti64x2_512_mask:
case X86::BI__builtin_ia32_extractf64x2_512_mask:
+ case X86::BI__builtin_ia32_extracti64x4_mask:
case X86::BI__builtin_ia32_extractf64x4_mask:{
APValue A, W;
APSInt Imm, U;
>From b9d0cdbb15385878f7ae052debc63493732cacff Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Mon, 29 Sep 2025 23:52:50 +0900
Subject: [PATCH 17/21] refactoring and add test code
---
clang/lib/Headers/avx512fintrin.h | 8 +--
clang/test/CodeGen/X86/avx512f-builtins.c | 87 ++++++++++++-----------
2 files changed, 48 insertions(+), 47 deletions(-)
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 80e58425cdd71..2768a5bae887d 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -3166,7 +3166,7 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
#define _mm512_extractf64x4_pd(A, I) \
((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
- (__v4df)_mm256_undefined_pd(), \
+ (__v4df)_mm256_setzero_pd(), \
(__mmask8)-1))
#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
@@ -3181,7 +3181,7 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
#define _mm512_extractf32x4_ps(A, I) \
((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
- (__v4sf)_mm_undefined_ps(), \
+ (__v4sf)_mm_setzero_ps(), \
(__mmask8)-1))
#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
@@ -7107,7 +7107,7 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
#define _mm512_extracti32x4_epi32(A, imm) \
((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v4si)_mm_undefined_si128(), \
+ (__v4si)_mm_setzero_si128(), \
(__mmask8)-1))
#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
@@ -7122,7 +7122,7 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
#define _mm512_extracti64x4_epi64(A, imm) \
((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
- (__v4di)_mm256_undefined_si256(), \
+ (__v4di)_mm256_setzero_si256(), \
(__mmask8)-1))
#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index d37b22285174e..7271e200bcaec 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -2452,11 +2452,9 @@ __m256d test_mm512_extractf64x4_pd(__m512d a)
// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm512_extractf64x4_pd(a, 1);
}
-TEST_CONSTEXPR(match_m256d(_mm512_extractf64x4_pd(((__m512d){
- 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0
- }), 1),
- 4.0, 5.0, 6.0, 7.0));
-
+TEST_CONSTEXPR(match_m256d(_mm512_extractf64x4_pd(((__m512d)
+{0.0,1.0,2.0,3.0, 4.0,5.0,6.0,7.0}),1),
+ 4.0, 5.0, 6.0, 7.0));
__m256d test_mm512_mask_extractf64x4_pd(__m256d __W,__mmask8 __U,__m512d __A){
// CHECK-LABEL: test_mm512_mask_extractf64x4_pd
@@ -2464,12 +2462,13 @@ __m256d test_mm512_mask_extractf64x4_pd(__m256d __W,__mmask8 __U,__m512d __A){
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm512_mask_extractf64x4_pd( __W, __U, __A, 1);
}
-TEST_CONSTEXPR(match_m256d(_mm512_mask_extractf64x4_pd(
- (__m256d){100.0,101.0,102.0,103.0}, // W(merge)
- (__mmask8)0x5, // 0101b
- (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0},
- 1),
- 4.0, 101.0, 6.0, 103.0));
+TEST_CONSTEXPR(match_m256d(
+ _mm512_mask_extractf64x4_pd(
+ ((__m256d){100.0,101.0,102.0,103.0}), // W (merge)
+ (__mmask8)0x5,
+ ((__m512d){0.0,1.0,2.0,3.0, 4.0,5.0,6.0,7.0}),
+ 1),
+ 4.0, 101.0, 6.0, 103.0));
__m256d test_mm512_maskz_extractf64x4_pd(__mmask8 __U,__m512d __A){
// CHECK-LABEL: test_mm512_maskz_extractf64x4_pd
@@ -2477,11 +2476,12 @@ __m256d test_mm512_maskz_extractf64x4_pd(__mmask8 __U,__m512d __A){
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm512_maskz_extractf64x4_pd( __U, __A, 1);
}
-TEST_CONSTEXPR(match_m256d(_mm512_maskz_extractf64x4_pd(
- (__mmask8)0x3,
- (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0},
- 1),
- 4.0, 5.0, 0.0, 0.0));
+TEST_CONSTEXPR(match_m256d(
+ _mm512_maskz_extractf64x4_pd(
+ (__mmask8)0x3,
+ ((__m512d){0.0,1.0,2.0,3.0, 4.0,5.0,6.0,7.0}),
+ 1),
+ 4.0, 5.0, 0.0, 0.0));
__m128 test_mm512_extractf32x4_ps(__m512 a)
{
@@ -2489,9 +2489,9 @@ __m128 test_mm512_extractf32x4_ps(__m512 a)
// CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm512_extractf32x4_ps(a, 1);
}
-TEST_CONSTEXPR(match_m128(_mm512_extractf32x4_ps(((__m512){
- 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15
- }), 1),
+TEST_CONSTEXPR(match_m128(_mm512_extractf32x4_ps(
+ ((__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),
+ 1),
4.0f, 5.0f, 6.0f, 7.0f));
__m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U,__m512 __A){
@@ -2501,9 +2501,9 @@ __m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U,__m512 __A){
return _mm512_mask_extractf32x4_ps( __W, __U, __A, 1);
}
TEST_CONSTEXPR(match_m128(_mm512_mask_extractf32x4_ps(
- (__m128){100,101,102,103}, // W(merge)
- (__mmask8)0x5, // 0101b
- (__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15},
+ ((__m128){100,101,102,103}),
+ (__mmask8)0x5,
+ ((__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),
1),
4.0f, 101.0f, 6.0f, 103.0f));
@@ -2515,7 +2515,7 @@ __m128 test_mm512_maskz_extractf32x4_ps( __mmask8 __U,__m512 __A){
}
TEST_CONSTEXPR(match_m128(_mm512_maskz_extractf32x4_ps(
(__mmask8)0x3,
- (__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15},
+ ((__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),
1),
4.0f, 5.0f, 0.0f, 0.0f));
@@ -7388,10 +7388,11 @@ __m128i test_mm512_extracti32x4_epi32(__m512i __A) {
// CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
return _mm512_extracti32x4_epi32(__A, 3);
}
-TEST_CONSTEXPR(match_m128i(_mm512_extracti32x4_epi32(((__m512i){
- 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15
- }), 3),
- 12, 13, 14, 15));
+TEST_CONSTEXPR(match_m128i(_mm512_extracti32x4_epi32(((__m512i)(__v16si)
+ {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 3),
+ 0x0000000D0000000CULL, // (13<<32)|12
+ 0x0000000F0000000EULL
+ ));
__m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_mask_extracti32x4_epi32
@@ -7400,14 +7401,16 @@ __m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __
return _mm512_mask_extracti32x4_epi32(__W, __U, __A, 3);
}
TEST_CONSTEXPR(match_m128i(_mm512_mask_extracti32x4_epi32(
- (__m128i){100,101,102,103}, // merge=W
+ ((__m128i)(__v4si){100,101,102,103}), // merge=W
(__mmask8)0x5, // 0101b
- (__m512i){
+ ((__m512i)(__v16si){
0,1,2,3, 4,5,6,7,
8,9,10,11, 12,13,14,15
- },
+ }),
3),
- 12, 101, 14, 103));
+ 0x000000650000000CULL, // (101<<32)|12
+ 0x000000670000000EULL // (103<<32)|14
+ ));
__m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) {
// CHECK-LABEL: test_mm512_maskz_extracti32x4_epi32
@@ -7417,21 +7420,19 @@ __m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) {
}
TEST_CONSTEXPR(match_m128i(_mm512_maskz_extracti32x4_epi32(
(__mmask8)0x3,
- (__m512i){
- 0,1,2,3, 4,5,6,7,
- 8,9,10,11, 12,13,14,15
- },
+ ((__m512i)(__v16si){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),
3),
-12, 13, 0, 0));
+ 0x0000000D0000000CULL, // (13<<32)|12
+ 0x0000000000000000ULL
+ ));
__m256i test_mm512_extracti64x4_epi64(__m512i __A) {
// CHECK-LABEL: test_mm512_extracti64x4_epi64
// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm512_extracti64x4_epi64(__A, 1);
}
-TEST_CONSTEXPR(match_m256i(_mm512_extracti64x4_epi64(((__m512i){
- 0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL
- }), 1),
+TEST_CONSTEXPR(match_m256i(
+ _mm512_extracti64x4_epi64(((__m512i)(__v8di){0,1,2,3,4,5,6,7}), 1),
4ULL, 5ULL, 6ULL, 7ULL));
__m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __A) {
@@ -7440,10 +7441,10 @@ __m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __
// CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm512_mask_extracti64x4_epi64(__W, __U, __A, 1);
}
-TEST_CONSTEXPR(match_m256i_64(_mm512_mask_extracti64x4_epi64(
- (__m256i){100ULL,101ULL,102ULL,103ULL},
+TEST_CONSTEXPR(match_m256i(_mm512_mask_extracti64x4_epi64(
+ ((__m256i)(__v4di){100ULL,101ULL,102ULL,103ULL}), // W
(__mmask8)0x5,
- (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL},
+ (((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL})),
1),
4ULL, 101ULL, 6ULL, 103ULL));
@@ -7455,7 +7456,7 @@ __m256i test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) {
}
TEST_CONSTEXPR(match_m256i(_mm512_maskz_extracti64x4_epi64(
(__mmask8)0x3,
- (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL},
+ (((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL})),
1),
4ULL, 5ULL, 0ULL, 0ULL));
>From 86e46dbaf94226966208496a731a0af83f032efa Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Tue, 30 Sep 2025 00:20:27 +0900
Subject: [PATCH 18/21] Add test and refactoring
---
clang/lib/Headers/avx512vldqintrin.h | 4 +-
clang/lib/Headers/avx512vlintrin.h | 4 +-
clang/test/CodeGen/X86/avx512vl-builtins.c | 64 +++++++++++---------
clang/test/CodeGen/X86/avx512vldq-builtins.c | 18 +++---
4 files changed, 48 insertions(+), 42 deletions(-)
diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h
index 68bd52e43981a..2d3c4b551e3b0 100644
--- a/clang/lib/Headers/avx512vldqintrin.h
+++ b/clang/lib/Headers/avx512vldqintrin.h
@@ -1075,7 +1075,7 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
#define _mm256_extractf64x2_pd(A, imm) \
((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
(int)(imm), \
- (__v2df)_mm_undefined_pd(), \
+ (__v2df)_mm_setzero_pd(), \
(__mmask8)-1))
#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
@@ -1093,7 +1093,7 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
#define _mm256_extracti64x2_epi64(A, imm) \
((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
(int)(imm), \
- (__v2di)_mm_undefined_si128(), \
+ (__v2di)_mm_setzero_si128(), \
(__mmask8)-1))
#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
index 965741f0ff944..252fb111988b0 100644
--- a/clang/lib/Headers/avx512vlintrin.h
+++ b/clang/lib/Headers/avx512vlintrin.h
@@ -7609,7 +7609,7 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
#define _mm256_extractf32x4_ps(A, imm) \
((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
(int)(imm), \
- (__v4sf)_mm_undefined_ps(), \
+ (__v4sf)_mm_setzero_ps(), \
(__mmask8)-1))
#define _mm256_mask_extractf32x4_ps(W, U, A, imm) \
@@ -7627,7 +7627,7 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
#define _mm256_extracti32x4_epi32(A, imm) \
((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
(int)(imm), \
- (__v4si)_mm_undefined_si128(), \
+ (__v4si)_mm_setzero_si128(), \
(__mmask8)-1))
#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \
diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c
index 323ac1b2cab63..4e2a31a26868a 100644
--- a/clang/test/CodeGen/X86/avx512vl-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vl-builtins.c
@@ -9875,9 +9875,8 @@ __m128 test_mm256_extractf32x4_ps(__m256 __A) {
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm256_extractf32x4_ps(__A, 1);
}
-TEST_CONSTEXPR(match_m128(_mm256_extractf32x4_ps(((__m256){
- 0,1,2,3, 4,5,6,7
- }), 1),
+TEST_CONSTEXPR(match_m128(
+ _mm256_extractf32x4_ps(((__m256){0,1,2,3, 4,5,6,7}), 1),
4.0f, 5.0f, 6.0f, 7.0f));
__m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) {
@@ -9886,12 +9885,13 @@ __m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) {
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm256_mask_extractf32x4_ps(__W, __U, __A, 1);
}
-TEST_CONSTEXPR( match_m128(_mm256_mask_extractf32x4_ps(
- (__m128){100,101,102,103}, // W (merge)
- (__mmask8)0x5, // 0101b
- (__m256){0,1,2,3, 4,5,6,7},
- 1),
- 4.0f, 101.0f, 6.0f, 103.0f));
+TEST_CONSTEXPR(match_m128(
+ _mm256_mask_extractf32x4_ps(
+ (((__m128){100,101,102,103})),
+ (__mmask8)0x5,
+ (((__m256){0,1,2,3, 4,5,6,7})),
+ 1),
+ 4.0f, 101.0f, 6.0f, 103.0f));
__m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) {
// CHECK-LABEL: test_mm256_maskz_extractf32x4_ps
@@ -9899,21 +9899,23 @@ __m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) {
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm256_maskz_extractf32x4_ps(__U, __A, 1);
}
-TEST_CONSTEXPR(match_m128(_mm256_maskz_extractf32x4_ps(
- (__mmask8)0x3,
- (__m256){0,1,2,3, 4,5,6,7},
- 1),
- 4.0f, 5.0f, 0.0f, 0.0f));
+TEST_CONSTEXPR(match_m128(
+ _mm256_maskz_extractf32x4_ps(
+ (__mmask8)0x3,
+ (((__m256){0,1,2,3, 4,5,6,7})),
+ 1),
+ 4.0f, 5.0f, 0.0f, 0.0f));
__m128i test_mm256_extracti32x4_epi32(__m256i __A) {
// CHECK-LABEL: test_mm256_extracti32x4_epi32
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm256_extracti32x4_epi32(__A, 1);
}
-TEST_CONSTEXPR(match_m128i(_mm256_extracti32x4_epi32(((__m256i){
- 0,1,2,3, 4,5,6,7
- }), 1),
- 4, 5, 6, 7));
+TEST_CONSTEXPR(match_m128i(
+ _mm256_extracti32x4_epi32(
+ (((__m256i)(__v8si){0,1,2,3, 4,5,6,7})), 1),
+ 0x0000000500000004ULL,
+ 0x0000000700000006ULL));
__m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: test_mm256_mask_extracti32x4_epi32
@@ -9921,12 +9923,14 @@ __m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm256_mask_extracti32x4_epi32(__W, __U, __A, 1);
}
-TEST_CONSTEXPR(match_m128i(_mm256_mask_extracti32x4_epi32(
- (__m128i){100,101,102,103}, // W (merge)
- (__mmask8)0xA, // 1010b
- (__m256i){0,1,2,3, 4,5,6,7},
- 1),
- 100, 5, 102, 7));
+TEST_CONSTEXPR(match_m128i(
+ _mm256_mask_extracti32x4_epi32(
+ (((__m128i)(__v4si){100,101,102,103})),
+ (__mmask8)0xA,
+ (((__m256i)(__v8si){0,1,2,3, 4,5,6,7})),
+ 1),
+ 0x0000000500000064ULL,
+ 0x0000000700000066ULL));
__m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: test_mm256_maskz_extracti32x4_epi32
@@ -9934,11 +9938,13 @@ __m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) {
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm256_maskz_extracti32x4_epi32(__U, __A, 1);
}
-TEST_CONSTEXPR(match_m128i(_mm256_maskz_extracti32x4_epi32(
- (__mmask8)0x3,
- (__m256i){0,1,2,3, 4,5,6,7},
- 1),
- 4, 5, 0, 0));
+TEST_CONSTEXPR(match_m128i(
+ _mm256_maskz_extracti32x4_epi32(
+ (__mmask8)0x3,
+ (((__m256i)(__v8si){0,1,2,3, 4,5,6,7})),
+ 1),
+ 0x0000000500000004ULL,
+ 0x0000000000000000ULL));
__m256 test_mm256_insertf32x4(__m256 __A, __m128 __B) {
// CHECK-LABEL: test_mm256_insertf32x4
diff --git a/clang/test/CodeGen/X86/avx512vldq-builtins.c b/clang/test/CodeGen/X86/avx512vldq-builtins.c
index 9cfcfea3dafc7..d566363d1f291 100644
--- a/clang/test/CodeGen/X86/avx512vldq-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vldq-builtins.c
@@ -1093,9 +1093,9 @@ __m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A)
return _mm256_mask_extractf64x2_pd(__W, __U, __A, 1);
}
TEST_CONSTEXPR(match_m128d(_mm256_mask_extractf64x2_pd(
- (__m128d){100.0, 101.0}, // W(merge)
+ (((__m128d){100.0, 101.0})), // W(merge)
(__mmask8)0x1,
- (__m256d){0.0,1.0,2.0,3.0},
+ (((__m256d){0.0,1.0,2.0,3.0})),
1),
2.0, 101.0));
@@ -1107,7 +1107,7 @@ __m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) {
}
TEST_CONSTEXPR(match_m128d(_mm256_maskz_extractf64x2_pd(
(__mmask8)0x2,
- (__m256d){0.0,1.0,2.0,3.0},
+ (((__m256d){0.0,1.0,2.0,3.0})),
1),
0.0, 3.0));
@@ -1116,7 +1116,7 @@ __m128i test_mm256_extracti64x2_epi64(__m256i __A) {
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
return _mm256_extracti64x2_epi64(__A, 1);
}
-TEST_CONSTEXPR(match_m128i_64(_mm256_extracti64x2_epi64(((__m256i){0ULL,1ULL,2ULL,3ULL}), 1),
+TEST_CONSTEXPR(match_m128i(_mm256_extracti64x2_epi64(((__m256i){0ULL,1ULL,2ULL,3ULL}), 1),
2ULL, 3ULL));
__m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __A) {
@@ -1125,10 +1125,10 @@ __m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm256_mask_extracti64x2_epi64(__W, __U, __A, 1);
}
-TEST_CONSTEXPR(match_m128i_64(_mm256_mask_extracti64x2_epi64(
- (__m128i){100ULL, 101ULL}, // W(merge)
+TEST_CONSTEXPR(match_m128i(_mm256_mask_extracti64x2_epi64(
+ (((__m128i){100ULL, 101ULL})), // W(merge)
(__mmask8)0x1,
- (__m256i){0ULL,1ULL,2ULL,3ULL},
+ (((__m256i){0ULL,1ULL,2ULL,3ULL})),
1),
2ULL, 101ULL));
@@ -1138,9 +1138,9 @@ __m128i test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) {
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm256_maskz_extracti64x2_epi64(__U, __A, 1);
}
-TEST_CONSTEXPR(match_m128i_64(_mm256_maskz_extracti64x2_epi64(
+TEST_CONSTEXPR(match_m128i(_mm256_maskz_extracti64x2_epi64(
(__mmask8)0x2,
- (__m256i){0ULL,1ULL,2ULL,3ULL},
+ (((__m256i){0ULL,1ULL,2ULL,3ULL})),
1),
0ULL, 3ULL));
>From 93b6fb3f5d74f5bfde70bf1c6f9a3a272599bf3f Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Mon, 6 Oct 2025 19:30:04 +0900
Subject: [PATCH 19/21] Remove comment
---
clang/lib/AST/ExprConstant.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index ae59da87a3ccd..60d819c319084 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12100,7 +12100,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
if ((K >> i) & 1)
ResultElements.push_back(A.getVectorElt(base + i));
else
- ResultElements.push_back(W.getVectorElt(i)); // maskz/unmasked 모두 헤더에서 맞춰줌
+ ResultElements.push_back(W.getVectorElt(i));
}
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
>From ce4577a23917068abcd9ac4acdda9fef06f5af8e Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Thu, 9 Oct 2025 01:53:26 +0900
Subject: [PATCH 20/21] Refactor review comments and remove unrelated files
---
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 45 +++++++++++-------------
clang/lib/AST/ExprConstant.cpp | 39 ++++++++++----------
2 files changed, 39 insertions(+), 45 deletions(-)
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 05ef09b3cbaee..8bed87f027712 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2855,16 +2855,13 @@ static bool interp__builtin_x86_extract_vector(InterpState &S, CodePtr OpPC,
unsigned ID) {
assert(Call->getNumArgs() == 2);
- // srcimm
APSInt ImmAPS = popToAPSInt(S, Call->getArg(1));
uint64_t Index = ImmAPS.getZExtValue();
- // srcvec
const Pointer &Src = S.Stk.pop<Pointer>();
if (!Src.getFieldDesc()->isPrimitiveArray())
return false;
- // destination (return value)
const Pointer &Dst = S.Stk.peek<Pointer>();
if (!Dst.getFieldDesc()->isPrimitiveArray())
return false;
@@ -2879,12 +2876,11 @@ static bool interp__builtin_x86_extract_vector(InterpState &S, CodePtr OpPC,
unsigned Lane = static_cast<unsigned>(Index % NumLanes);
unsigned ExtractPos = Lane * DstElems;
- // element type
- PrimType ElemPT = Src.getFieldDesc()->getPrimType();
- if (ElemPT != Dst.getFieldDesc()->getPrimType())
+ PrimType ElemT = Src.getFieldDesc()->getPrimType();
+ if (ElemT != Dst.getFieldDesc()->getPrimType())
return false;
- TYPE_SWITCH(ElemPT, {
+ TYPE_SWITCH(ElemT, {
for (unsigned I = 0; I != DstElems; ++I) {
Dst.elem<T>(I) = Src.elem<T>(ExtractPos + I);
}
@@ -2899,41 +2895,40 @@ static bool interp__builtin_x86_extract_vector_masked(InterpState &S, CodePtr Op
unsigned ID) {
assert(Call->getNumArgs() == 4);
- APSInt UAPS = popToAPSInt(S, Call->getArg(3));
- const Pointer &W = S.Stk.pop<Pointer>();
+ APSInt MaskAPS = popToAPSInt(S, Call->getArg(3));
+ const Pointer &Merge = S.Stk.pop<Pointer>();
APSInt ImmAPS = popToAPSInt(S, Call->getArg(1));
- const Pointer &A = S.Stk.pop<Pointer>();
+ const Pointer &Src = S.Stk.pop<Pointer>();
- if (!A.getFieldDesc()->isPrimitiveArray() || !W.getFieldDesc()->isPrimitiveArray())
+ if (!Src.getFieldDesc()->isPrimitiveArray() || !Merge.getFieldDesc()->isPrimitiveArray())
return false;
const Pointer &Dst = S.Stk.peek<Pointer>();
if (!Dst.getFieldDesc()->isPrimitiveArray())
return false;
- unsigned SrcElems = A.getNumElems();
+ unsigned SrcElems = Src.getNumElems();
unsigned DstElems = Dst.getNumElems();
if (!SrcElems || !DstElems || (SrcElems % DstElems) != 0)
return false;
- // 타입 일치 체크
- PrimType PT = A.getFieldDesc()->getPrimType();
- if (PT != Dst.getFieldDesc()->getPrimType() ||
- PT != W.getFieldDesc()->getPrimType())
+ PrimType ElemT = Src.getFieldDesc()->getPrimType();
+ if (ElemT != Dst.getFieldDesc()->getPrimType() ||
+ ElemT != Merge.getFieldDesc()->getPrimType())
return false;
- unsigned numLanes = SrcElems / DstElems;
- unsigned lane = static_cast<unsigned>(ImmAPS.getZExtValue() % numLanes);
- unsigned base = lane * DstElems;
+ unsigned NumLanes = SrcElems / DstElems;
+ unsigned Lane = static_cast<unsigned>(ImmAPS.getZExtValue() % NumLanes);
+ unsigned Base = Lane * DstElems;
- uint64_t U = UAPS.getZExtValue();
+ uint64_t Mask = MaskAPS.getZExtValue();
- TYPE_SWITCH(PT, {
- for (unsigned i = 0; i < DstElems; ++i) {
- if ((U >> i) & 1)
- Dst.elem<T>(i) = A.elem<T>(base + i);
+ TYPE_SWITCH(ElemT, {
+ for (unsigned I = 0; I < DstElems; ++I) {
+ if ((Mask >> I) & 1)
+ Dst.elem<T>(I) = Src.elem<T>(Base + I);
else
- Dst.elem<T>(i) = W.elem<T>(i);
+ Dst.elem<T>(I) = Merge.elem<T>(I);
}
});
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 60d819c319084..25ded19554518 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12049,13 +12049,13 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
if (SrcLen != RetLen * 2)
return false;
- unsigned idx = SourceImm.getInt().getZExtValue() & 1;
+ unsigned Idx = SourceImm.getInt().getZExtValue() & 1;
SmallVector<APValue, 32> ResultElements;
ResultElements.reserve(RetLen);
- for (unsigned i = 0; i < RetLen; i++)
- ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i));
+ for (unsigned I = 0; I < RetLen; I++)
+ ResultElements.push_back(SourceVec.getVectorElt(Idx * RetLen + I));
return Success(APValue(ResultElements.data(), RetLen), E);
}
@@ -12072,35 +12072,34 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case X86::BI__builtin_ia32_extractf64x2_512_mask:
case X86::BI__builtin_ia32_extracti64x4_mask:
case X86::BI__builtin_ia32_extractf64x4_mask:{
- APValue A, W;
- APSInt Imm, U;
+ APValue SourceVec, MergeVec;
+ APSInt Imm, MaskImm;
- if (!EvaluateAsRValue(Info, E->getArg(0), A) || // A
- !EvaluateInteger(E->getArg(1), Imm, Info) || // imm
- !EvaluateAsRValue(Info, E->getArg(2), W) || // W (merge)
- !EvaluateInteger(E->getArg(3), U, Info)) // U (mask)
+ if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) ||
+ !EvaluateInteger(E->getArg(1), Imm, Info) ||
+ !EvaluateAsRValue(Info, E->getArg(2), MergeVec) ||
+ !EvaluateInteger(E->getArg(3), MaskImm, Info))
return false;
const auto *RetVT = E->getType()->castAs<VectorType>();
- // QualType EltTy = RetVT->getElementType();
unsigned RetLen = RetVT->getNumElements();
- if (!A.isVector() || !W.isVector()) return false;
- unsigned SrcLen = A.getVectorLength();
+ if (!SourceVec.isVector() || !MergeVec.isVector()) return false;
+ unsigned SrcLen = SourceVec.getVectorLength();
if (!SrcLen || !RetLen || (SrcLen % RetLen) != 0) return false;
- unsigned lanes = SrcLen / RetLen;
- unsigned lane = static_cast<unsigned>(Imm.getZExtValue() % lanes);
- unsigned base = lane * RetLen;
- uint64_t K = U.getZExtValue();
+ unsigned Lanes = SrcLen / RetLen;
+ unsigned Lane = static_cast<unsigned>(Imm.getZExtValue() % Lanes);
+ unsigned Base = Lane * RetLen;
+ uint64_t Mask = MaskImm.getZExtValue();
SmallVector<APValue, 32> ResultElements;
ResultElements.reserve(RetLen);
- for (unsigned i = 0; i < RetLen; ++i) {
- if ((K >> i) & 1)
- ResultElements.push_back(A.getVectorElt(base + i));
+ for (unsigned I = 0; I < RetLen; ++I) {
+ if ((Mask >> I) & 1)
+ ResultElements.push_back(SourceVec.getVectorElt(Base + I));
else
- ResultElements.push_back(W.getVectorElt(i));
+ ResultElements.push_back(MergeVec.getVectorElt(I));
}
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
>From 24e06be02a1133e834c54e4aac813b274c732dc4 Mon Sep 17 00:00:00 2001
From: SeongjaeP <psjj960507 at gmail.com>
Date: Thu, 9 Oct 2025 14:25:24 +0900
Subject: [PATCH 21/21] Apply style fixes and rebase onto upstream
---
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 314 +++++------------------
clang/lib/AST/ExprConstant.cpp | 6 +-
2 files changed, 71 insertions(+), 249 deletions(-)
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 8bed87f027712..c8479b9b09a17 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -678,6 +678,30 @@ static bool interp__builtin_popcount(InterpState &S, CodePtr OpPC,
return true;
}
+static bool interp__builtin_parity(InterpState &S, CodePtr OpPC,
+ const InterpFrame *Frame,
+ const CallExpr *Call) {
+ APSInt Val = popToAPSInt(S, Call->getArg(0));
+ pushInteger(S, Val.popcount() % 2, Call->getType());
+ return true;
+}
+
+static bool interp__builtin_clrsb(InterpState &S, CodePtr OpPC,
+ const InterpFrame *Frame,
+ const CallExpr *Call) {
+ APSInt Val = popToAPSInt(S, Call->getArg(0));
+ pushInteger(S, Val.getBitWidth() - Val.getSignificantBits(), Call->getType());
+ return true;
+}
+
+static bool interp__builtin_bitreverse(InterpState &S, CodePtr OpPC,
+ const InterpFrame *Frame,
+ const CallExpr *Call) {
+ APSInt Val = popToAPSInt(S, Call->getArg(0));
+ pushInteger(S, Val.reverseBits(), Call->getType());
+ return true;
+}
+
static bool interp__builtin_classify_type(InterpState &S, CodePtr OpPC,
const InterpFrame *Frame,
const CallExpr *Call) {
@@ -2310,14 +2334,10 @@ static bool interp__builtin_object_size(InterpState &S, CodePtr OpPC,
if (Ptr.isBaseClass())
ByteOffset = computePointerOffset(ASTCtx, Ptr.getBase()) -
computePointerOffset(ASTCtx, Ptr);
- else {
- if (Ptr.inArray())
- ByteOffset =
- computePointerOffset(ASTCtx, Ptr) -
- computePointerOffset(ASTCtx, Ptr.expand().atIndex(0).narrow());
- else
- ByteOffset = 0;
- }
+ else
+ ByteOffset =
+ computePointerOffset(ASTCtx, Ptr) -
+ computePointerOffset(ASTCtx, Ptr.expand().atIndex(0).narrow());
} else
ByteOffset = computePointerOffset(ASTCtx, Ptr);
@@ -2579,11 +2599,9 @@ static bool interp__builtin_elementwise_maxmin(InterpState &S, CodePtr OpPC,
return true;
}
-static bool interp__builtin_ia32_pmul(
- InterpState &S, CodePtr OpPC, const CallExpr *Call,
- llvm::function_ref<APInt(const APSInt &, const APSInt &, const APSInt &,
- const APSInt &)>
- Fn) {
+static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
+ const CallExpr *Call,
+ unsigned BuiltinID) {
assert(Call->getArg(0)->getType()->isVectorType() &&
Call->getArg(1)->getType()->isVectorType());
const Pointer &RHS = S.Stk.pop<Pointer>();
@@ -2592,23 +2610,35 @@ static bool interp__builtin_ia32_pmul(
const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
PrimType ElemT = *S.getContext().classify(VT->getElementType());
- unsigned NumElems = VT->getNumElements();
- const auto *DestVT = Call->getType()->castAs<VectorType>();
- PrimType DestElemT = *S.getContext().classify(DestVT->getElementType());
- bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();
+ unsigned SourceLen = VT->getNumElements();
+ PrimType DstElemT = *S.getContext().classify(
+ Call->getType()->castAs<VectorType>()->getElementType());
unsigned DstElem = 0;
- for (unsigned I = 0; I != NumElems; I += 2) {
- APSInt Result;
+ for (unsigned I = 0; I != SourceLen; I += 2) {
+ APSInt Elem1;
+ APSInt Elem2;
INT_TYPE_SWITCH_NO_BOOL(ElemT, {
- APSInt LoLHS = LHS.elem<T>(I).toAPSInt();
- APSInt HiLHS = LHS.elem<T>(I + 1).toAPSInt();
- APSInt LoRHS = RHS.elem<T>(I).toAPSInt();
- APSInt HiRHS = RHS.elem<T>(I + 1).toAPSInt();
- Result = APSInt(Fn(LoLHS, HiLHS, LoRHS, HiRHS), DestUnsigned);
+ Elem1 = LHS.elem<T>(I).toAPSInt();
+ Elem2 = RHS.elem<T>(I).toAPSInt();
});
- INT_TYPE_SWITCH_NO_BOOL(DestElemT,
+ APSInt Result;
+ switch (BuiltinID) {
+ case clang::X86::BI__builtin_ia32_pmuludq128:
+ case clang::X86::BI__builtin_ia32_pmuludq256:
+ case clang::X86::BI__builtin_ia32_pmuludq512:
+ Result = APSInt(llvm::APIntOps::muluExtended(Elem1, Elem2),
+ /*IsUnsigned=*/true);
+ break;
+ case clang::X86::BI__builtin_ia32_pmuldq128:
+ case clang::X86::BI__builtin_ia32_pmuldq256:
+ case clang::X86::BI__builtin_ia32_pmuldq512:
+ Result = APSInt(llvm::APIntOps::mulsExtended(Elem1, Elem2),
+ /*IsUnsigned=*/false);
+ break;
+ }
+ INT_TYPE_SWITCH_NO_BOOL(DstElemT,
{ Dst.elem<T>(DstElem) = static_cast<T>(Result); });
++DstElem;
}
@@ -2744,48 +2774,6 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
return true;
}
-static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC,
- const CallExpr *Call, bool IsShufHW) {
- assert(Call->getNumArgs() == 2 && "masked forms handled via select*");
- APSInt ControlImm = popToAPSInt(S, Call->getArg(1));
- const Pointer &Src = S.Stk.pop<Pointer>();
- const Pointer &Dst = S.Stk.peek<Pointer>();
-
- unsigned NumElems = Dst.getNumElems();
- PrimType ElemT = Dst.getFieldDesc()->getPrimType();
-
- unsigned ElemBits = static_cast<unsigned>(primSize(ElemT) * 8);
- if (ElemBits != 16 && ElemBits != 32)
- return false;
-
- unsigned LaneElts = 128u / ElemBits;
- assert(LaneElts && (NumElems % LaneElts == 0));
-
- uint8_t Ctl = static_cast<uint8_t>(ControlImm.getZExtValue());
-
- for (unsigned Idx = 0; Idx != NumElems; Idx++) {
- unsigned LaneBase = (Idx / LaneElts) * LaneElts;
- unsigned LaneIdx = Idx % LaneElts;
- unsigned SrcIdx = Idx;
- unsigned Sel = (Ctl >> (2 * (LaneIdx & 0x3))) & 0x3;
- if (ElemBits == 32) {
- SrcIdx = LaneBase + Sel;
- } else {
- constexpr unsigned HalfSize = 4;
- bool InHigh = LaneIdx >= HalfSize;
- if (!IsShufHW && !InHigh) {
- SrcIdx = LaneBase + Sel;
- } else if (IsShufHW && InHigh) {
- SrcIdx = LaneBase + HalfSize + Sel;
- }
- }
-
- INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem<T>(Idx) = Src.elem<T>(SrcIdx); });
- }
- Dst.initializeAllElements();
- return true;
-}
-
static bool interp__builtin_elementwise_triop(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<APInt(const APSInt &, const APSInt &, const APSInt &)>
@@ -2918,13 +2906,13 @@ static bool interp__builtin_x86_extract_vector_masked(InterpState &S, CodePtr Op
return false;
unsigned NumLanes = SrcElems / DstElems;
- unsigned Lane = static_cast<unsigned>(ImmAPS.getZExtValue() % NumLanes);
- unsigned Base = Lane * DstElems;
+ unsigned Lane = static_cast<unsigned>(ImmAPS.getZExtValue() % NumLanes);
+ unsigned Base = Lane * DstElems;
uint64_t Mask = MaskAPS.getZExtValue();
TYPE_SWITCH(ElemT, {
- for (unsigned I = 0; I < DstElems; ++I) {
+ for (unsigned I = 0; I != DstElems; ++I) {
if ((Mask >> I) & 1)
Dst.elem<T>(I) = Src.elem<T>(Base + I);
else
@@ -2975,104 +2963,7 @@ static bool interp__builtin_x86_insert_subvector(InterpState &S, CodePtr OpPC,
});
Dst.initializeAllElements();
- return true;
-}
-static bool interp__builtin_ia32_pternlog(InterpState &S, CodePtr OpPC,
- const CallExpr *Call, bool MaskZ) {
- assert(Call->getNumArgs() == 5);
-
- APInt U = popToAPSInt(S, Call->getArg(4)); // Lane mask
- APInt Imm = popToAPSInt(S, Call->getArg(3)); // Ternary truth table
- const Pointer &C = S.Stk.pop<Pointer>();
- const Pointer &B = S.Stk.pop<Pointer>();
- const Pointer &A = S.Stk.pop<Pointer>();
- const Pointer &Dst = S.Stk.peek<Pointer>();
-
- unsigned DstLen = A.getNumElems();
- const QualType ElemQT = getElemType(A);
- const OptPrimType ElemPT = S.getContext().classify(ElemQT);
- unsigned LaneWidth = S.getASTContext().getTypeSize(ElemQT);
- bool DstUnsigned = ElemQT->isUnsignedIntegerOrEnumerationType();
-
- INT_TYPE_SWITCH_NO_BOOL(*ElemPT, {
- for (unsigned I = 0; I != DstLen; ++I) {
- APInt ALane = A.elem<T>(I).toAPSInt();
- APInt BLane = B.elem<T>(I).toAPSInt();
- APInt CLane = C.elem<T>(I).toAPSInt();
- APInt RLane(LaneWidth, 0);
- if (U[I]) { // If lane not masked, compute ternary logic.
- for (unsigned Bit = 0; Bit != LaneWidth; ++Bit) {
- unsigned ABit = ALane[Bit];
- unsigned BBit = BLane[Bit];
- unsigned CBit = CLane[Bit];
- unsigned Idx = (ABit << 2) | (BBit << 1) | (CBit);
- RLane.setBitVal(Bit, Imm[Idx]);
- }
- Dst.elem<T>(I) = static_cast<T>(APSInt(RLane, DstUnsigned));
- } else if (MaskZ) { // If zero masked, zero the lane.
- Dst.elem<T>(I) = static_cast<T>(APSInt(RLane, DstUnsigned));
- } else { // Just masked, put in A lane.
- Dst.elem<T>(I) = static_cast<T>(APSInt(ALane, DstUnsigned));
- }
- }
- });
- Dst.initializeAllElements();
- return true;
-}
-
-static bool interp__builtin_vec_ext(InterpState &S, CodePtr OpPC,
- const CallExpr *Call, unsigned ID) {
- assert(Call->getNumArgs() == 2);
-
- APSInt ImmAPS = popToAPSInt(S, Call->getArg(1));
- const Pointer &Vec = S.Stk.pop<Pointer>();
- if (!Vec.getFieldDesc()->isPrimitiveArray())
- return false;
-
- unsigned NumElems = Vec.getNumElems();
- unsigned Index =
- static_cast<unsigned>(ImmAPS.getZExtValue() & (NumElems - 1));
-
- PrimType ElemPT = Vec.getFieldDesc()->getPrimType();
- // FIXME(#161685): Replace float+int split with a numeric-only type switch
- if (ElemPT == PT_Float) {
- S.Stk.push<Floating>(Vec.elem<Floating>(Index));
- return true;
- }
- INT_TYPE_SWITCH_NO_BOOL(ElemPT, {
- APSInt V = Vec.elem<T>(Index).toAPSInt();
- pushInteger(S, V, Call->getType());
- });
-
- return true;
-}
-
-static bool interp__builtin_vec_set(InterpState &S, CodePtr OpPC,
- const CallExpr *Call, unsigned ID) {
- assert(Call->getNumArgs() == 3);
-
- APSInt ImmAPS = popToAPSInt(S, Call->getArg(2));
- APSInt ValAPS = popToAPSInt(S, Call->getArg(1));
-
- const Pointer &Base = S.Stk.pop<Pointer>();
- if (!Base.getFieldDesc()->isPrimitiveArray())
- return false;
-
- const Pointer &Dst = S.Stk.peek<Pointer>();
-
- unsigned NumElems = Base.getNumElems();
- unsigned Index =
- static_cast<unsigned>(ImmAPS.getZExtValue() & (NumElems - 1));
-
- PrimType ElemPT = Base.getFieldDesc()->getPrimType();
- INT_TYPE_SWITCH_NO_BOOL(ElemPT, {
- for (unsigned I = 0; I != NumElems; ++I)
- Dst.elem<T>(I) = Base.elem<T>(I);
- Dst.elem<T>(Index) = static_cast<T>(ValAPS);
- });
-
- Dst.initializeAllElements();
return true;
}
@@ -3232,25 +3123,18 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case Builtin::BI__builtin_parity:
case Builtin::BI__builtin_parityl:
case Builtin::BI__builtin_parityll:
- return interp__builtin_elementwise_int_unaryop(
- S, OpPC, Call, [](const APSInt &Val) -> APInt {
- return APInt(Val.getBitWidth(), Val.popcount() % 2);
- });
+ return interp__builtin_parity(S, OpPC, Frame, Call);
+
case Builtin::BI__builtin_clrsb:
case Builtin::BI__builtin_clrsbl:
case Builtin::BI__builtin_clrsbll:
- return interp__builtin_elementwise_int_unaryop(
- S, OpPC, Call, [](const APSInt &Val) -> APInt {
- return APInt(Val.getBitWidth(),
- Val.getBitWidth() - Val.getSignificantBits());
- });
+ return interp__builtin_clrsb(S, OpPC, Frame, Call);
+
case Builtin::BI__builtin_bitreverse8:
case Builtin::BI__builtin_bitreverse16:
case Builtin::BI__builtin_bitreverse32:
case Builtin::BI__builtin_bitreverse64:
- return interp__builtin_elementwise_int_unaryop(
- S, OpPC, Call,
- [](const APSInt &Val) -> APInt { return Val.reverseBits(); });
+ return interp__builtin_bitreverse(S, OpPC, Frame, Call);
case Builtin::BI__builtin_classify_type:
return interp__builtin_classify_type(S, OpPC, Frame, Call);
@@ -3268,10 +3152,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case Builtin::BI_rotl:
case Builtin::BI_lrotl:
case Builtin::BI_rotl64:
- return interp__builtin_elementwise_int_binop(
- S, OpPC, Call, [](const APSInt &Value, const APSInt &Amount) -> APInt {
- return Value.rotl(Amount);
- });
+ return interp__builtin_rotate(S, OpPC, Frame, Call, /*Right=*/false);
case Builtin::BI__builtin_rotateright8:
case Builtin::BI__builtin_rotateright16:
@@ -3282,19 +3163,12 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case Builtin::BI_rotr:
case Builtin::BI_lrotr:
case Builtin::BI_rotr64:
- return interp__builtin_elementwise_int_binop(
- S, OpPC, Call, [](const APSInt &Value, const APSInt &Amount) -> APInt {
- return Value.rotr(Amount);
- });
+ return interp__builtin_rotate(S, OpPC, Frame, Call, /*Right=*/true);
case Builtin::BI__builtin_ffs:
case Builtin::BI__builtin_ffsl:
case Builtin::BI__builtin_ffsll:
- return interp__builtin_elementwise_int_unaryop(
- S, OpPC, Call, [](const APSInt &Val) {
- return APInt(Val.getBitWidth(),
- Val.isZero() ? 0u : Val.countTrailingZeros() + 1u);
- });
+ return interp__builtin_ffs(S, OpPC, Frame, Call);
case Builtin::BIaddressof:
case Builtin::BI__addressof:
@@ -3604,7 +3478,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case clang::X86::BI__builtin_ia32_pmaddubsw128:
case clang::X86::BI__builtin_ia32_pmaddubsw256:
case clang::X86::BI__builtin_ia32_pmaddubsw512:
- return interp__builtin_ia32_pmul(
+ return interp__builtin_ia32_pmadd(
S, OpPC, Call,
[](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &LoRHS,
const APSInt &HiRHS) {
@@ -3616,7 +3490,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case clang::X86::BI__builtin_ia32_pmaddwd128:
case clang::X86::BI__builtin_ia32_pmaddwd256:
case clang::X86::BI__builtin_ia32_pmaddwd512:
- return interp__builtin_ia32_pmul(
+ return interp__builtin_ia32_pmadd(
S, OpPC, Call,
[](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &LoRHS,
const APSInt &HiRHS) {
@@ -3879,21 +3753,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_selectpd_512:
return interp__builtin_select(S, OpPC, Call);
- case X86::BI__builtin_ia32_pshuflw:
- case X86::BI__builtin_ia32_pshuflw256:
- case X86::BI__builtin_ia32_pshuflw512:
- return interp__builtin_ia32_pshuf(S, OpPC, Call, false);
-
- case X86::BI__builtin_ia32_pshufhw:
- case X86::BI__builtin_ia32_pshufhw256:
- case X86::BI__builtin_ia32_pshufhw512:
- return interp__builtin_ia32_pshuf(S, OpPC, Call, true);
-
- case X86::BI__builtin_ia32_pshufd:
- case X86::BI__builtin_ia32_pshufd256:
- case X86::BI__builtin_ia32_pshufd512:
- return interp__builtin_ia32_pshuf(S, OpPC, Call, false);
-
case X86::BI__builtin_ia32_kandqi:
case X86::BI__builtin_ia32_kandhi:
case X86::BI__builtin_ia32_kandsi:
@@ -3949,20 +3808,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
S, OpPC, Call,
[](const APSInt &LHS, const APSInt &RHS) { return LHS + RHS; });
- case X86::BI__builtin_ia32_pternlogd128_mask:
- case X86::BI__builtin_ia32_pternlogd256_mask:
- case X86::BI__builtin_ia32_pternlogd512_mask:
- case X86::BI__builtin_ia32_pternlogq128_mask:
- case X86::BI__builtin_ia32_pternlogq256_mask:
- case X86::BI__builtin_ia32_pternlogq512_mask:
- return interp__builtin_ia32_pternlog(S, OpPC, Call, /*MaskZ=*/false);
- case X86::BI__builtin_ia32_pternlogd128_maskz:
- case X86::BI__builtin_ia32_pternlogd256_maskz:
- case X86::BI__builtin_ia32_pternlogd512_maskz:
- case X86::BI__builtin_ia32_pternlogq128_maskz:
- case X86::BI__builtin_ia32_pternlogq256_maskz:
- case X86::BI__builtin_ia32_pternlogq512_maskz:
- return interp__builtin_ia32_pternlog(S, OpPC, Call, /*MaskZ=*/true);
case Builtin::BI__builtin_elementwise_fshl:
return interp__builtin_elementwise_triop(S, OpPC, Call,
llvm::APIntOps::fshl);
@@ -3988,29 +3833,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_insert128i256:
return interp__builtin_x86_insert_subvector(S, OpPC, Call, BuiltinID);
- case X86::BI__builtin_ia32_vec_ext_v4hi:
- case X86::BI__builtin_ia32_vec_ext_v16qi:
- case X86::BI__builtin_ia32_vec_ext_v8hi:
- case X86::BI__builtin_ia32_vec_ext_v4si:
- case X86::BI__builtin_ia32_vec_ext_v2di:
- case X86::BI__builtin_ia32_vec_ext_v32qi:
- case X86::BI__builtin_ia32_vec_ext_v16hi:
- case X86::BI__builtin_ia32_vec_ext_v8si:
- case X86::BI__builtin_ia32_vec_ext_v4di:
- case X86::BI__builtin_ia32_vec_ext_v4sf:
- return interp__builtin_vec_ext(S, OpPC, Call, BuiltinID);
-
- case X86::BI__builtin_ia32_vec_set_v4hi:
- case X86::BI__builtin_ia32_vec_set_v16qi:
- case X86::BI__builtin_ia32_vec_set_v8hi:
- case X86::BI__builtin_ia32_vec_set_v4si:
- case X86::BI__builtin_ia32_vec_set_v2di:
- case X86::BI__builtin_ia32_vec_set_v32qi:
- case X86::BI__builtin_ia32_vec_set_v16hi:
- case X86::BI__builtin_ia32_vec_set_v8si:
- case X86::BI__builtin_ia32_vec_set_v4di:
- return interp__builtin_vec_set(S, OpPC, Call, BuiltinID);
-
default:
S.FFDiag(S.Current->getLocation(OpPC),
diag::note_invalid_subexpr_in_const_expr)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 25ded19554518..281f9a36093ec 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12089,9 +12089,9 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
if (!SrcLen || !RetLen || (SrcLen % RetLen) != 0) return false;
unsigned Lanes = SrcLen / RetLen;
- unsigned Lane = static_cast<unsigned>(Imm.getZExtValue() % Lanes);
- unsigned Base = Lane * RetLen;
- uint64_t Mask = MaskImm.getZExtValue();
+ unsigned Lane = static_cast<unsigned>(Imm.getZExtValue() % Lanes);
+ unsigned Base = Lane * RetLen;
+ uint64_t Mask = MaskImm.getZExtValue();
SmallVector<APValue, 32> ResultElements;
ResultElements.reserve(RetLen);
More information about the cfe-commits mailing list