[clang] [compiler-rt] [llvm] [X86] Add AVX512BMM support for AMD Zen 6 (znver6) (PR #182556)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 20 12:02:15 PST 2026
https://github.com/ganeshgit updated https://github.com/llvm/llvm-project/pull/182556
>From bfd7cdc558c48b73409784c80959f19e78e2a60f Mon Sep 17 00:00:00 2001
From: Ganesh Gopalasubramanian <Ganesh.Gopalasubramanian at amd.com>
Date: Fri, 20 Feb 2026 15:34:41 +0530
Subject: [PATCH] [X86] Add AVX512BMM support for AMD Zen 6 (znver6)
This patch adds support for the AVX512BMM (Bit Matrix Multiply)
instruction set extension on AMD Zen 6 processors.
AVX512BMM includes three instructions:
- VBITREVB: Bit reverse within each byte
- VBMACOR: Bit matrix multiply with OR accumulation
- VBMACXOR: Bit matrix multiply with XOR accumulation
The following implementations for AVX512BMM are added:
- Define __AVX512BMM__ macro for znver6
- avx512bmmintrin.h, avx512bmmvlintrin.h header files
- Implement _mm_bitrev_epi8, _mm256_bitrev_epi8, _mm512_bitrev_epi8
- Implement _mm256/512_bmacor16x16x16 and bmacxor intrinsics
---
clang/docs/ReleaseNotes.rst | 14 +
clang/include/clang/Basic/BuiltinsX86.td | 10 +
clang/lib/Basic/Targets/X86.cpp | 6 +
clang/lib/Basic/Targets/X86.h | 1 +
clang/lib/CodeGen/TargetBuiltins/X86.cpp | 24 ++
clang/lib/Headers/CMakeLists.txt | 2 +
clang/lib/Headers/avx512bmmintrin.h | 176 +++++++++++++
clang/lib/Headers/avx512bmmvlintrin.h | 245 ++++++++++++++++++
clang/lib/Headers/immintrin.h | 4 +
clang/test/CodeGen/attr-target-x86.c | 4 +-
compiler-rt/lib/builtins/cpu_model/x86.c | 3 +
llvm/include/llvm/IR/IntrinsicsX86.td | 18 ++
.../llvm/TargetParser/X86TargetParser.def | 1 +
llvm/lib/Target/X86/X86.td | 6 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 12 +
llvm/lib/Target/X86/X86ISelLowering.h | 4 +
llvm/lib/Target/X86/X86InstrAVX512.td | 61 +++++
llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 3 +
llvm/lib/Target/X86/X86InstrPredicates.td | 2 +
llvm/lib/Target/X86/X86IntrinsicsInfo.h | 6 +
llvm/lib/TargetParser/Host.cpp | 1 +
llvm/lib/TargetParser/X86TargetParser.cpp | 3 +-
.../X86/avx512bmm-vbitrevb-bitreverse.ll | 85 ++++++
.../X86/avx512bmm-vbitrevb-intrinsics-mem.ll | 141 ++++++++++
.../X86/avx512bmm-vbitrevb-intrinsics.ll | 139 ++++++++++
.../CodeGen/X86/avx512bmm-vbmac-intrinsics.ll | 63 +++++
llvm/test/TableGen/x86-fold-tables.inc | 33 +++
.../gn/secondary/clang/lib/Headers/BUILD.gn | 2 +
28 files changed, 1065 insertions(+), 4 deletions(-)
create mode 100644 clang/lib/Headers/avx512bmmintrin.h
create mode 100644 clang/lib/Headers/avx512bmmvlintrin.h
create mode 100644 llvm/test/CodeGen/X86/avx512bmm-vbitrevb-bitreverse.ll
create mode 100644 llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics-mem.ll
create mode 100644 llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics.ll
create mode 100644 llvm/test/CodeGen/X86/avx512bmm-vbmac-intrinsics.ll
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 8d5d704c1766a..84461b2fc211b 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -335,6 +335,20 @@ NVPTX Support
X86 Support
^^^^^^^^^^^
- ``march=znver6`` is now supported.
+- Support ISA of ``AVX512BMM``.
+ * Support intrinsic of ``_mm512_bmacor16x16x16``.
+ * Support intrinsic of ``_mm512_bmacxor16x16x16``.
+ * Support intrinsic of ``_mm512_mask_bitrev_epi8``.
+ * Support intrinsic of ``_mm512_maskz_bitrev_epi8``.
+ * Support intrinsic of ``_mm512_bitrev_epi8``.
+ * Support intrinsic of ``_mm256_bmacor16x16x16``.
+ * Support intrinsic of ``_mm256_bmacxor16x16x16``.
+ * Support intrinsic of ``_mm_mask_bitrev_epi8``.
+ * Support intrinsic of ``_mm256_mask_bitrev_epi8``.
+ * Support intrinsic of ``_mm_maskz_bitrev_epi8``.
+ * Support intrinsic of ``_mm256_maskz_bitrev_epi8``.
+ * Support intrinsic of ``_mm_bitrev_epi8``.
+ * Support intrinsic of ``_mm256_bitrev_epi8``.
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 23eac47eb5e4c..0d4b40cc1791f 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -5055,3 +5055,13 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>
let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def vgetmantbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
}
+
+let Features = "avx512bmm", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+ def bmacor16x16x16_v32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
+ def bmacxor16x16x16_v32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512bmm,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+ def bmacor16x16x16_v16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
+ def bmacxor16x16x16_v16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
+}
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 6f88a428b1230..2c66d14f5f7b7 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -301,6 +301,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAVX512DQ = true;
} else if (Feature == "+avx512bitalg") {
HasAVX512BITALG = true;
+ } else if (Feature == "+avx512bmm") {
+ HasAVX512BMM = true;
} else if (Feature == "+avx512bw") {
HasAVX512BW = true;
} else if (Feature == "+avx512vl") {
@@ -841,6 +843,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AVX512DQ__");
if (HasAVX512BITALG)
Builder.defineMacro("__AVX512BITALG__");
+ if (HasAVX512BMM)
+ Builder.defineMacro("__AVX512BMM__");
if (HasAVX512BW)
Builder.defineMacro("__AVX512BW__");
if (HasAVX512VL) {
@@ -1085,6 +1089,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("avx512fp16", true)
.Case("avx512dq", true)
.Case("avx512bitalg", true)
+ .Case("avx512bmm", true)
.Case("avx512bw", true)
.Case("avx512vl", true)
.Case("avx512vbmi", true)
@@ -1205,6 +1210,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("avx512fp16", HasAVX512FP16)
.Case("avx512dq", HasAVX512DQ)
.Case("avx512bitalg", HasAVX512BITALG)
+ .Case("avx512bmm", HasAVX512BMM)
.Case("avx512bw", HasAVX512BW)
.Case("avx512vl", HasAVX512VL)
.Case("avx512vbmi", HasAVX512VBMI)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 922e32906cd04..6bd55f9fbf4bb 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -104,6 +104,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAVX512BF16 = false;
bool HasAVX512DQ = false;
bool HasAVX512BITALG = false;
+ bool HasAVX512BMM = false;
bool HasAVX512BW = false;
bool HasAVX512VL = false;
bool HasAVX512VBMI = false;
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index 9645ed87b8ef3..4807c66442c92 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2678,6 +2678,30 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return EmitX86MaskedCompareResult(*this, Shufbit, NumElts, MaskIn);
}
+ case X86::BI__builtin_ia32_bmacor16x16x16_v16hi:
+ case X86::BI__builtin_ia32_bmacor16x16x16_v32hi:
+ case X86::BI__builtin_ia32_bmacxor16x16x16_v16hi:
+ case X86::BI__builtin_ia32_bmacxor16x16x16_v32hi: {
+ Intrinsic::ID ID;
+ switch (BuiltinID) {
+ default:
+ llvm_unreachable("Unsupported intrinsic!");
+ case X86::BI__builtin_ia32_bmacor16x16x16_v16hi:
+ ID = Intrinsic::x86_avx512_vbmacor_v16hi;
+ break;
+ case X86::BI__builtin_ia32_bmacor16x16x16_v32hi:
+ ID = Intrinsic::x86_avx512_vbmacor_v32hi;
+ break;
+ case X86::BI__builtin_ia32_bmacxor16x16x16_v16hi:
+ ID = Intrinsic::x86_avx512_vbmacxor_v16hi;
+ break;
+ case X86::BI__builtin_ia32_bmacxor16x16x16_v32hi:
+ ID = Intrinsic::x86_avx512_vbmacxor_v32hi;
+ break;
+ }
+
+ return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
+ }
// packed comparison intrinsics
case X86::BI__builtin_ia32_cmpeqps:
case X86::BI__builtin_ia32_cmpeqpd:
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 95d20bbca79ac..5ea3cfa588f82 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -186,6 +186,8 @@ set(x86_files
avx2intrin.h
avx512bf16intrin.h
avx512bitalgintrin.h
+ avx512bmmintrin.h
+ avx512bmmvlintrin.h
avx512bwintrin.h
avx512cdintrin.h
avx512dqintrin.h
diff --git a/clang/lib/Headers/avx512bmmintrin.h b/clang/lib/Headers/avx512bmmintrin.h
new file mode 100644
index 0000000000000..ce4ada8742766
--- /dev/null
+++ b/clang/lib/Headers/avx512bmmintrin.h
@@ -0,0 +1,176 @@
+/*===-------- avx512bmmintrin.h - AVX512BMM intrinsics -------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===---------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bmmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BMMINTRIN_H
+#define _AVX512BMMINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512bmm"), \
+ __min_vector_width__(512)))
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
+/// Multiplies two 16x16 bit matrices using OR reduction and ORs the product
+/// into a third 16x16 bit matrix (which is also the destination).
+///
+/// For the 512-bit ZMM form, each register contains two 16x16 (256-bit)
+/// matrices in bits [255:0] and [511:256]. The operation performs:
+/// \code{.operation}
+/// for i in 0 to 15
+/// for j in 0 to 15
+/// reduction_bit = __C[16*i+j]
+/// for k in 0 to 15
+/// reduction_bit |= __A[16*i+k] & __B[16*k+j]
+/// end for k
+/// dest[16*i+j] = reduction_bit
+/// end for j
+/// end for i
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBMACOR16X16X16 </c> instruction.
+///
+/// \param __A
+/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit
+/// lane).
+/// \param __B
+/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit
+/// lane).
+/// \param __C
+/// A 512-bit accumulator vector containing the initial values to OR with.
+/// \returns A 512-bit vector containing the accumulated result for each lane.
+/// \note This instruction does not support masking.
+static __inline __m512i __DEFAULT_FN_ATTRS _mm512_bmacor16x16x16(__m512i __A,
+ __m512i __B,
+ __m512i __C) {
+ return (__m512i)__builtin_ia32_bmacor16x16x16_v32hi(
+ (__v32hi)__A, (__v32hi)__B, (__v32hi)__C);
+}
+
+/// Multiplies two 16x16 bit matrices using XOR reduction and XORs the product
+/// into a third 16x16 bit matrix (which is also the destination).
+///
+/// For the 512-bit ZMM form, each register contains two 16x16 (256-bit)
+/// matrices in bits [255:0] and [511:256]. The operation performs:
+/// \code{.operation}
+/// for i in 0 to 15
+/// for j in 0 to 15
+/// reduction_bit = __C[16*i+j]
+/// for k in 0 to 15
+/// reduction_bit ^= __A[16*i+k] & __B[16*k+j]
+/// end for k
+/// dest[16*i+j] = reduction_bit
+/// end for j
+/// end for i
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBMACXOR16X16X16 </c> instruction.
+///
+/// \param __A
+/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit
+/// lane).
+/// \param __B
+/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit
+/// lane).
+/// \param __C
+/// A 512-bit accumulator vector containing the initial values to XOR with.
+/// \returns A 512-bit vector containing the accumulated result for each lane.
+/// \note This instruction does not support masking.
+static __inline __m512i __DEFAULT_FN_ATTRS _mm512_bmacxor16x16x16(__m512i __A,
+ __m512i __B,
+ __m512i __C) {
+ return (__m512i)__builtin_ia32_bmacxor16x16x16_v32hi(
+ (__v32hi)__A, (__v32hi)__B, (__v32hi)__C);
+}
+
+/// Reverses the bits within each byte of the source vector.
+///
+/// For each byte in the source, reverses the order of its 8 bits to generate
+/// the corresponding destination byte. For example, 0b10110001 becomes
+/// 0b10001101.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBITREVB </c> instruction.
+///
+/// \param __A
+/// A 512-bit vector of [64 x i8] where each byte will have its bits
+/// reversed.
+/// \returns A 512-bit vector of [64 x i8] with bit-reversed bytes.
+static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_bitrev_epi8(__m512i __A) {
+ return (__m512i)__builtin_elementwise_bitreverse((__v64qi)__A);
+}
+
+/// Reverses the bits within each byte of the source vector, using a writemask
+/// to conditionally select elements.
+///
+/// For each byte position, if the corresponding mask bit is 1, the byte from
+/// \a __A has its bits reversed and stored in the result. If the mask bit is 0,
+/// the corresponding byte from \a __B is copied to the result (merge masking).
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBITREVB </c> instruction.
+///
+/// \param __U
+/// A 64-bit mask value where each bit controls one byte (per 8-bit element).
+/// A 1 performs bit reversal; a 0 selects the passthrough byte from __B.
+/// \param __A
+/// A 512-bit vector of [64 x i8] to be bit-reversed.
+/// \param __B
+/// A 512-bit vector of [64 x i8] providing passthrough values.
+/// \returns A 512-bit vector combining bit-reversed and passthrough bytes.
+static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_mask_bitrev_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512(
+ (__mmask64)__U, (__v64qi)_mm512_bitrev_epi8(__A), (__v64qi)__B);
+}
+
+/// Reverses the bits within each byte of the source vector, zeroing elements
+/// based on the writemask.
+///
+/// For each byte position, if the corresponding mask bit is 1, the byte from
+/// \a __A has its bits reversed and stored in the result. If the mask bit is 0,
+/// the result byte is set to zero (zero masking).
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBITREVB </c> instruction.
+///
+/// \param __U
+/// A 64-bit mask value where each bit controls one byte (per 8-bit element).
+/// A 1 performs bit reversal; a 0 sets the byte to zero.
+/// \param __A
+/// A 512-bit vector of [64 x i8] to be bit-reversed.
+/// \returns A 512-bit vector with bit-reversed or zeroed bytes.
+static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_maskz_bitrev_epi8(__mmask64 __U, __m512i __A) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_bitrev_epi8(__A),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
+
+#endif
diff --git a/clang/lib/Headers/avx512bmmvlintrin.h b/clang/lib/Headers/avx512bmmvlintrin.h
new file mode 100644
index 0000000000000..68a04db460047
--- /dev/null
+++ b/clang/lib/Headers/avx512bmmvlintrin.h
@@ -0,0 +1,245 @@
+/*===----------- avx512bmmvlintrin.h - AVX512BMM intrinsics -------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx512bmmvlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BMMVLINTRIN_H
+#define _AVX512BMMVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("avx512bmm,avx512vl"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("avx512bmm,avx512vl"), __min_vector_width__(256)))
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
+/// Multiplies two 16x16 bit matrices using OR reduction and ORs the product
+/// into a third 16x16 bit matrix (which is also the destination).
+///
+/// For the 256-bit YMM form, the source registers/memory each contain a single
+/// 16x16 (256-bit) matrix in bits [255:0]. The operation performs:
+/// \code{.operation}
+/// for i in 0 to 15
+/// for j in 0 to 15
+/// reduction_bit = __C[16*i+j]
+/// for k in 0 to 15
+/// reduction_bit |= __A[16*i+k] & __B[16*k+j]
+/// end for k
+/// dest[16*i+j] = reduction_bit
+/// end for j
+/// end for i
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBMACOR16X16X16 </c> instruction.
+///
+/// \param __A
+/// A 256-bit vector containing a 16x16 bit matrix.
+/// \param __B
+/// A 256-bit vector containing a 16x16 bit matrix.
+/// \param __C
+/// A 256-bit accumulator vector containing the initial values to OR with.
+/// \returns A 256-bit vector containing the accumulated result.
+/// \note This instruction does not support masking.
+static __inline __m256i __DEFAULT_FN_ATTRS256
+_mm256_bmacor16x16x16(__m256i __A, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_bmacor16x16x16_v16hi(
+ (__v16hi)__A, (__v16hi)__B, (__v16hi)__C);
+}
+
+/// Multiplies two 16x16 bit matrices using XOR reduction and XORs the product
+/// into a third 16x16 bit matrix (which is also the destination).
+///
+/// For the 256-bit YMM form, the source registers/memory each contain a single
+/// 16x16 (256-bit) matrix in bits [255:0]. The operation performs:
+/// \code{.operation}
+/// for i in 0 to 15
+/// for j in 0 to 15
+/// reduction_bit = __C[16*i+j]
+/// for k in 0 to 15
+/// reduction_bit ^= __A[16*i+k] & __B[16*k+j]
+/// end for k
+/// dest[16*i+j] = reduction_bit
+/// end for j
+/// end for i
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBMACXOR16X16X16 </c> instruction.
+///
+/// \param __A
+/// A 256-bit vector containing a 16x16 bit matrix.
+/// \param __B
+/// A 256-bit vector containing a 16x16 bit matrix.
+/// \param __C
+/// A 256-bit accumulator vector containing the initial values to XOR with.
+/// \returns A 256-bit vector containing the accumulated result.
+/// \note This instruction does not support masking.
+static __inline __m256i __DEFAULT_FN_ATTRS256
+_mm256_bmacxor16x16x16(__m256i __A, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_bmacxor16x16x16_v16hi(
+ (__v16hi)__A, (__v16hi)__B, (__v16hi)__C);
+}
+
+/// Reverses the bits within each byte of the source vector.
+///
+/// For each byte in the source, reverses the order of its 8 bits to generate
+/// the corresponding destination byte. For example, 0b10110001 becomes
+/// 0b10001101.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBITREVB </c> instruction.
+///
+/// \param __A
+/// A 128-bit vector of [16 x i8] where each byte will have its bits
+/// reversed.
+/// \returns A 128-bit vector of [16 x i8] with bit-reversed bytes.
+static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_bitrev_epi8(__m128i __A) {
+ return (__m128i)__builtin_elementwise_bitreverse((__v16qi)__A);
+}
+
+/// Reverses the bits within each byte of the source vector.
+///
+/// For each byte in the source, reverses the order of its 8 bits to generate
+/// the corresponding destination byte. For example, 0b10110001 becomes
+/// 0b10001101.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBITREVB </c> instruction.
+///
+/// \param __A
+/// A 256-bit vector of [32 x i8] where each byte will have its bits
+/// reversed.
+/// \returns A 256-bit vector of [32 x i8] with bit-reversed bytes.
+static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_bitrev_epi8(__m256i __A) {
+ return (__m256i)__builtin_elementwise_bitreverse((__v32qi)__A);
+}
+
+/// Reverses the bits within each byte of the source vector, using a writemask
+/// to conditionally select elements.
+///
+/// For each byte position, if the corresponding mask bit is 1, the byte from
+/// \a __A has its bits reversed and stored in the result. If the mask bit is 0,
+/// the corresponding byte from \a __B is copied to the result (merge masking).
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBITREVB </c> instruction.
+///
+/// \param __U
+/// A 16-bit mask value where each bit controls one byte (per 8-bit element).
+/// A 1 performs bit reversal; a 0 selects the passthrough byte from __B.
+/// \param __A
+/// A 128-bit vector of [16 x i8] to be bit-reversed.
+/// \param __B
+/// A 128-bit vector of [16 x i8] providing passthrough values.
+/// \returns A 128-bit vector combining bit-reversed and passthrough bytes.
+static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_bitrev_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128(
+ (__mmask16)__U, (__v16qi)_mm_bitrev_epi8(__A), (__v16qi)__B);
+}
+
+/// Reverses the bits within each byte of the source vector, using a writemask
+/// to conditionally select elements.
+///
+/// For each byte position, if the corresponding mask bit is 1, the byte from
+/// \a __A has its bits reversed and stored in the result. If the mask bit is 0,
+/// the corresponding byte from \a __B is copied to the result (merge masking).
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBITREVB </c> instruction.
+///
+/// \param __U
+/// A 32-bit mask value where each bit controls one byte (per 8-bit element).
+/// A 1 performs bit reversal; a 0 selects the passthrough byte from __B.
+/// \param __A
+/// A 256-bit vector of [32 x i8] to be bit-reversed.
+/// \param __B
+/// A 256-bit vector of [32 x i8] providing passthrough values.
+/// \returns A 256-bit vector combining bit-reversed and passthrough bytes.
+static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_bitrev_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256(
+ (__mmask32)__U, (__v32qi)_mm256_bitrev_epi8(__A), (__v32qi)__B);
+}
+
+/// Reverses the bits within each byte of the source vector, zeroing elements
+/// based on the writemask.
+///
+/// For each byte position, if the corresponding mask bit is 1, the byte from
+/// \a __A has its bits reversed and stored in the result. If the mask bit is 0,
+/// the result byte is set to zero (zero masking).
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBITREVB </c> instruction.
+///
+/// \param __U
+/// A 16-bit mask value where each bit controls one byte (per 8-bit element).
+/// A 1 performs bit reversal; a 0 sets the byte to zero.
+/// \param __A
+/// A 128-bit vector of [16 x i8] to be bit-reversed.
+/// \returns A 128-bit vector with bit-reversed or zeroed bytes.
+static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_bitrev_epi8(__mmask16 __U, __m128i __A) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_bitrev_epi8(__A),
+ (__v16qi)_mm_setzero_si128());
+}
+
+/// Reverses the bits within each byte of the source vector, zeroing elements
+/// based on the writemask.
+///
+/// For each byte position, if the corresponding mask bit is 1, the byte from
+/// \a __A has its bits reversed and stored in the result. If the mask bit is 0,
+/// the result byte is set to zero (zero masking).
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> VBITREVB </c> instruction.
+///
+/// \param __U
+/// A 32-bit mask value where each bit controls one byte (per 8-bit element).
+/// A 1 performs bit reversal; a 0 sets the byte to zero.
+/// \param __A
+/// A 256-bit vector of [32 x i8] to be bit-reversed.
+/// \returns A 256-bit vector with bit-reversed or zeroed bytes.
+static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_bitrev_epi8(__mmask32 __U, __m256i __A) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_bitrev_epi8(__A),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 19064a4ff5cea..00107c44c3a55 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -58,6 +58,10 @@
#include <avx512bitalgintrin.h>
+#include <avx512bmmintrin.h>
+
+#include <avx512bmmvlintrin.h>
+
#include <avx512cdintrin.h>
#include <avx512vpopcntdqintrin.h>
diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c
index 474fa93629d89..6a110ce38605b 100644
--- a/clang/test/CodeGen/attr-target-x86.c
+++ b/clang/test/CodeGen/attr-target-x86.c
@@ -33,7 +33,7 @@ __attribute__((target("fpmath=387")))
void f_fpmath_387(void) {}
// CHECK-NOT: tune-cpu
-// CHECK: [[f_no_sse2]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-amx-avx512,-avx,-avx10.1,-avx10.2,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
+// CHECK: [[f_no_sse2]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-amx-avx512,-avx,-avx10.1,-avx10.2,-avx2,-avx512bf16,-avx512bitalg,-avx512bmm,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
__attribute__((target("no-sse2")))
void f_no_sse2(void) {}
@@ -41,7 +41,7 @@ void f_no_sse2(void) {}
__attribute__((target("sse4")))
void f_sse4(void) {}
-// CHECK: [[f_no_sse4]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-amx-avx512,-avx,-avx10.1,-avx10.2,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
+// CHECK: [[f_no_sse4]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-amx-avx512,-avx,-avx10.1,-avx10.2,-avx2,-avx512bf16,-avx512bitalg,-avx512bmm,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
__attribute__((target("no-sse4")))
void f_no_sse4(void) {}
diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c
index a71078e9064d5..dbfebc4949655 100644
--- a/compiler-rt/lib/builtins/cpu_model/x86.c
+++ b/compiler-rt/lib/builtins/cpu_model/x86.c
@@ -232,6 +232,7 @@ enum ProcessorFeatures {
FEATURE_AMX_FP8 = 120,
FEATURE_MOVRS,
FEATURE_AMX_MOVRS,
+ FEATURE_AVX512BMM,
CPU_FEATURE_MAX
};
@@ -1150,6 +1151,8 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
// AMD cpuid bit for prefetchi is different from Intel
if (HasExtLeaf21 && ((EAX >> 20) & 1))
setFeature(FEATURE_PREFETCHI);
+ if (HasExtLeaf21 && ((EAX >> 23) & 1) && HasAVX512Save)
+ setFeature(FEATURE_AVX512BMM);
bool HasLeaf14 = MaxLevel >= 0x14 &&
!getX86CpuIDAndInfoEx(0x14, 0x0, &EAX, &EBX, &ECX, &EDX);
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index b75a0485d6263..9f7619b2bb279 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -7341,4 +7341,22 @@ def int_x86_movrsdi : ClangBuiltin<"__builtin_ia32_movrsdi">,
[IntrReadMem]>;
def int_x86_prefetchrs : ClangBuiltin<"__builtin_ia32_prefetchrs">,
Intrinsic<[], [llvm_ptr_ty], []>;
+
+//===----------------------------------------------------------------------===//
+// BMM intrinsics
+
+def int_x86_avx512_vbmacor_v16hi :
+ DefaultAttrsIntrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty],
+ [IntrNoMem]>;
+def int_x86_avx512_vbmacor_v32hi :
+ DefaultAttrsIntrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty],
+ [IntrNoMem]>;
+
+def int_x86_avx512_vbmacxor_v16hi :
+ DefaultAttrsIntrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty],
+ [IntrNoMem]>;
+def int_x86_avx512_vbmacxor_v32hi :
+ DefaultAttrsIntrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty],
+ [IntrNoMem]>;
}
+//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index bc05452400458..084c1a5b05b21 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -248,6 +248,7 @@ X86_FEATURE_COMPAT(AMX_TF32, "amx-tf32", 0, 118)
X86_FEATURE_COMPAT(AMX_FP8, "amx-fp8", 0, 120)
X86_FEATURE_COMPAT(MOVRS, "movrs", 0, 121)
X86_FEATURE_COMPAT(AMX_MOVRS, "amx-movrs", 0, 122)
+X86_FEATURE_COMPAT(AVX512BMM, "avx512bmm", 0, 123)
// Features we don't multiversion on.
X86_FEATURE (NF, "nf")
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index fa41d7a7a7c5e..086a9af372d6e 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -145,6 +145,9 @@ def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true",
def FeatureVBMI2 : SubtargetFeature<"avx512vbmi2", "HasVBMI2", "true",
"Enable AVX-512 further Vector Byte Manipulation Instructions",
[FeatureBWI]>;
+def FeatureBMM : SubtargetFeature<"avx512bmm", "HasBMM", "true",
+ "Enable AVX512 Bit Matrix Multiply",
+ [FeatureBWI]>;
def FeatureAVXIFMA : SubtargetFeature<"avxifma", "HasAVXIFMA", "true",
"Enable AVX-IFMA",
[FeatureAVX2]>;
@@ -1636,7 +1639,8 @@ def ProcessorFeatures {
list<SubtargetFeature> ZN6AdditionalFeatures = [FeatureFP16,
FeatureAVXVNNIINT8,
FeatureAVXNECONVERT,
- FeatureAVXIFMA
+ FeatureAVXIFMA,
+ FeatureBMM
];
list<SubtargetFeature> ZN6Features =
!listconcat(ZN5Features, ZN6AdditionalFeatures);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1bf969db93ee0..2a3e423583b18 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2311,6 +2311,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
+
+ if (Subtarget.hasBMM()) {
+ for (auto VT : {MVT::v16i8, MVT::v32i8, MVT::v64i8})
+ setOperationAction(ISD::BITREVERSE, VT, Legal);
+ }
}
if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
@@ -33250,6 +33255,11 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
unsigned NumElts = VT.getVectorNumElements();
+ // If we have BMM, BITREVERSE on vXi8 is marked Legal and will be handled
+ // by TableGen pattern matching to VPBITREVB instruction. We should not
+ // reach here in that case.
+ assert(!Subtarget.hasBMM() && "BMM should use Legal operation action");
+
// If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
if (Subtarget.hasGFNI()) {
SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
@@ -36080,6 +36090,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(POP_FROM_X87_REG)
NODE_NAME_CASE(TC_RETURN_GLOBALADDR)
NODE_NAME_CASE(CALL_GLOBALADDR)
+ NODE_NAME_CASE(VBMACOR)
+ NODE_NAME_CASE(VBMACXOR)
}
return nullptr;
#undef NODE_NAME_CASE
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index fc16053caa705..4c55cfe2c3a30 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1002,6 +1002,10 @@ namespace llvm {
AESENCWIDE256KL,
AESDECWIDE256KL,
+ // BMM Instructions
+ VBMACOR,
+ VBMACXOR,
+
/// Compare and Add if Condition is Met. Compare value in operand 2 with
/// value in memory of operand 1. If condition of operand 4 is met, add
/// value operand 3 to m32 and write new value in operand 1. Operand 2 is
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index df0d614a0251f..2f443e68bc783 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -11320,6 +11320,41 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
Sched<[sched.Folded]>;
}
+// Variant of avx512_unary_rm that requires aligned memory operands: the
+// memory form (rm) only matches loads through _.AlignedLdFrag, while the
+// register form (rr) matches OpNode applied to a register operand.
+// NOTE(review): in the visible part of this patch the only user is
+// avx512_unary_rm_vl_aligned, which no defm instantiates (VPBITREVB uses
+// avx512_unary_rm_vl) -- confirm this multiclass is still needed.
+multiclass avx512_unary_rm_aligned<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1), OpcodeStr,
+ "$src1", "$src1",
+ (_.VT (OpNode (_.VT _.RC:$src1)))>, EVEX, AVX5128IBase,
+ Sched<[sched]>;
+
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1), OpcodeStr,
+ "$src1", "$src1",
+ (_.VT (OpNode (_.VT (bitconvert (_.AlignedLdFrag addr:$src1)))))>,
+ EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded]>;
+ }
+}
+
+// Instantiates avx512_unary_rm_aligned at all three vector lengths: Z (512-bit)
+// under [prd], and Z256/Z128 under [prd, HasVLX].
+// NOTE(review): nothing in the visible part of this patch instantiates this
+// multiclass -- confirm it is needed before landing.
+multiclass avx512_unary_rm_vl_aligned<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_unary_rm_aligned<opc, OpcodeStr, OpNode, sched.ZMM, VTInfo.info512>,
+ EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_unary_rm_aligned<opc, OpcodeStr, OpNode, sched.YMM, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_unary_rm_aligned<opc, OpcodeStr, OpNode, sched.XMM, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
@@ -13764,3 +13799,29 @@ let Uses = [MXCSR] in {
defm VFCMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfcmulcsh", x86vfcmulcSh, x86vfcmulcShRnd, 0>,
T_MAP6, XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX, VVVV;
}
+
+// VPBITREVB (assembly mnemonic "vbitrevb") - AVX512BMM bit reverse on byte
+// elements. Selected from the generic bitreverse node on the i8 vector types
+// of avx512vl_i8_info; opcode 0x81 in MAP6, gated on HasBMM.
+defm VPBITREVB : avx512_unary_rm_vl<0x81, "vbitrevb", bitreverse, SchedWriteVecALU,
+ avx512vl_i8_info, HasBMM>, T_MAP6, PS;
+
+// Extra selection patterns for bitreverse keyed on the "VPBITREVB" name.
+// NOTE(review): presumably this covers 128/256-bit vectors when VLX is not
+// available -- confirm against the avx512_unary_lowering definition.
+defm : avx512_unary_lowering<"VPBITREVB", bitreverse, avx512vl_i8_info, HasBMM>;
+
+// VBMACOR, VBMACXOR - BMM matrix multiplication instructions.
+// Both share opcode 0x80 in MAP6 and differ only in EVEX.W: VBMACOR is W0 and
+// VBMACXOR is W1 (REX_W). Only the 256-bit (additionally requires HasVLX) and
+// 512-bit forms are defined here; no 128-bit form is defined.
+// VBMACOR: EVEX.256.NP.MAP6.W0 80 /r, EVEX.512.NP.MAP6.W0 80 /r
+let Predicates = [HasBMM, HasVLX] in
+defm VBMACORZ256 : VNNI_rmb<0x80, "vbmacor16x16x16", x86vbmacor, SchedWriteVecIMul.YMM, v16i16x_info, 0>,
+ EVEX_V256, T_MAP6;
+
+let Predicates = [HasBMM] in
+defm VBMACORZ : VNNI_rmb<0x80, "vbmacor16x16x16", x86vbmacor, SchedWriteVecIMul.ZMM, v32i16_info, 0>,
+ EVEX_V512, T_MAP6;
+
+// VBMACXOR: EVEX.256.NP.MAP6.W1 80 /r, EVEX.512.NP.MAP6.W1 80 /r
+let Predicates = [HasBMM, HasVLX] in
+defm VBMACXORZ256 : VNNI_rmb<0x80, "vbmacxor16x16x16", x86vbmacxor, SchedWriteVecIMul.YMM, v16i16x_info, 0>,
+ EVEX_V256, T_MAP6, REX_W;
+
+let Predicates = [HasBMM] in
+defm VBMACXORZ : VNNI_rmb<0x80, "vbmacxor16x16x16", x86vbmacxor, SchedWriteVecIMul.ZMM, v32i16_info, 0>,
+ EVEX_V512, T_MAP6, REX_W;
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index a2b09c62af958..d8ce8eac7ec84 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -1518,3 +1518,6 @@ def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
return N->hasOneUse();
}]>;
+// BMM matrix multiplication operations.
+// SelectionDAG nodes for X86ISD::VBMACOR and X86ISD::VBMACXOR; both reuse the
+// SDTVnni type profile and are the nodes matched by the VBMACOR*/VBMACXOR*
+// instruction definitions.
+def x86vbmacor : SDNode<"X86ISD::VBMACOR", SDTVnni>;
+def x86vbmacxor : SDNode<"X86ISD::VBMACXOR", SDTVnni>;
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 21e6bacbacee2..e9819778dbe4f 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -85,6 +85,7 @@ def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">;
def HasDQI : Predicate<"Subtarget->hasDQI()">;
def NoDQI : Predicate<"!Subtarget->hasDQI()">;
def HasBWI : Predicate<"Subtarget->hasBWI()">;
+def HasBMM : Predicate<"Subtarget->hasBMM()">;
def NoBWI : Predicate<"!Subtarget->hasBWI()">;
def HasVLX : Predicate<"Subtarget->hasVLX()">;
def NoVLX : Predicate<"!Subtarget->hasVLX()">;
@@ -175,6 +176,7 @@ def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">;
def HasAMXFP16 : Predicate<"Subtarget->hasAMXFP16()">;
def HasCMPCCXADD : Predicate<"Subtarget->hasCMPCCXADD()">;
def HasAVXNECONVERT : Predicate<"Subtarget->hasAVXNECONVERT()">;
def HasKL : Predicate<"Subtarget->hasKL()">;
def HasRAOINT : Predicate<"Subtarget->hasRAOINT()">;
def HasWIDEKL : Predicate<"Subtarget->hasWIDEKL()">;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index c0c98c1f35491..059ea32ff7c7c 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1389,6 +1389,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_uitofp_round, INTR_TYPE_1OP, ISD::UINT_TO_FP,
X86ISD::UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_vbmacor_v16hi, INTR_TYPE_3OP, X86ISD::VBMACOR, 0),
+ X86_INTRINSIC_DATA(avx512_vbmacor_v32hi, INTR_TYPE_3OP, X86ISD::VBMACOR, 0),
+ X86_INTRINSIC_DATA(avx512_vbmacxor_v16hi, INTR_TYPE_3OP, X86ISD::VBMACXOR,
+ 0),
+ X86_INTRINSIC_DATA(avx512_vbmacxor_v32hi, INTR_TYPE_3OP, X86ISD::VBMACXOR,
+ 0),
X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_1OP, X86ISD::CVTS2SI,
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index e1bdfbe42d07f..fa5c041395b4f 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -2085,6 +2085,7 @@ StringMap<bool> sys::getHostCPUFeatures() {
!getX86CpuIDAndInfo(0x80000021, &EAX, &EBX, &ECX, &EDX);
// AMD cpuid bit for prefetchi is different from Intel
Features["prefetchi"] = HasExtLeaf21 && ((EAX >> 20) & 1);
+ Features["avx512bmm"] = HasExtLeaf21 && ((EAX >> 23) & 1) && HasAVX512Save;
bool HasLeaf7 =
MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX);
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index f848b1ac08607..c06a4dc295d06 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -257,7 +257,7 @@ static constexpr FeatureBitset FeaturesZNVER5 =
static constexpr FeatureBitset FeaturesZNVER6 =
FeaturesZNVER5 | FeatureAVXVNNIINT8 | FeatureAVX512FP16 | FeatureAVXIFMA |
- FeatureAVXNECONVERT;
+ FeatureAVXNECONVERT | FeatureAVX512BMM;
// D151696 tranplanted Mangling and OnlyForCPUDispatchSpecific from
// X86TargetParser.def to here. They are assigned by following ways:
@@ -596,6 +596,7 @@ constexpr FeatureBitset ImpliedFeaturesAVX512VL = FeatureAVX512F;
constexpr FeatureBitset ImpliedFeaturesAVX512BF16 = FeatureAVX512BW;
constexpr FeatureBitset ImpliedFeaturesAVX512BITALG = FeatureAVX512BW;
+constexpr FeatureBitset ImpliedFeaturesAVX512BMM = FeatureAVX512BW;
constexpr FeatureBitset ImpliedFeaturesAVX512IFMA = FeatureAVX512F;
constexpr FeatureBitset ImpliedFeaturesAVX512VNNI = FeatureAVX512F;
constexpr FeatureBitset ImpliedFeaturesAVX512VPOPCNTDQ = FeatureAVX512F;
diff --git a/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-bitreverse.ll b/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-bitreverse.ll
new file mode 100644
index 0000000000000..54d329946b4a9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-bitreverse.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bmm,+avx512vl,+avx512bw --show-mc-encoding | FileCheck %s
+
+; Test vbitrevb instruction generation from the llvm.bitreverse intrinsic.
+; This test verifies that llvm.bitreverse on byte vectors generates vbitrevb
+; instructions when AVX512BMM is available. The IR was converted from C (bitrev3.c).
+
+; Test 512-bit vector bit reversal with aligned memory load
+define <64 x i8> @bitrev_zmm_aligned_load(ptr %ptr) {
+; CHECK-LABEL: bitrev_zmm_aligned_load:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbitrevb (%rdi), %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x81,0x07]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <64 x i8>, ptr %ptr, align 64
+ %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0)
+ ret <64 x i8> %1
+}
+
+; Test 256-bit with aligned memory load (AVX512VL)
+define <32 x i8> @bitrev_ymm_aligned_load(ptr %ptr) {
+; CHECK-LABEL: bitrev_ymm_aligned_load:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbitrevb (%rdi), %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x81,0x07]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <32 x i8>, ptr %ptr, align 32
+ %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0)
+ ret <32 x i8> %1
+}
+
+; Test 128-bit with aligned memory load (AVX512VL + AVX512BW)
+define <16 x i8> @bitrev_xmm_aligned_load(ptr %ptr) {
+; CHECK-LABEL: bitrev_xmm_aligned_load:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbitrevb (%rdi), %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x81,0x07]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <16 x i8>, ptr %ptr, align 16
+ %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0)
+ ret <16 x i8> %1
+}
+
+; Test 512-bit with unaligned memory load
+; Memory operand can be folded directly into vbitrevb (no alignment required)
+define <64 x i8> @bitrev_zmm_unaligned_load(ptr %ptr) {
+; CHECK-LABEL: bitrev_zmm_unaligned_load:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbitrevb (%rdi), %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x81,0x07]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <64 x i8>, ptr %ptr, align 1
+ %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0)
+ ret <64 x i8> %1
+}
+
+; Test 256-bit with unaligned memory load
+; Memory operand can be folded directly into vbitrevb (no alignment required)
+define <32 x i8> @bitrev_ymm_unaligned_load(ptr %ptr) {
+; CHECK-LABEL: bitrev_ymm_unaligned_load:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbitrevb (%rdi), %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x81,0x07]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <32 x i8>, ptr %ptr, align 1
+ %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0)
+ ret <32 x i8> %1
+}
+
+; Test 128-bit with unaligned memory load
+; Memory operand can be folded directly into vbitrevb (no alignment required)
+define <16 x i8> @bitrev_xmm_unaligned_load(ptr %ptr) {
+; CHECK-LABEL: bitrev_xmm_unaligned_load:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbitrevb (%rdi), %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x81,0x07]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <16 x i8>, ptr %ptr, align 1
+ %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0)
+ ret <16 x i8> %1
+}
+
+declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>)
+declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
+declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>)
diff --git a/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics-mem.ll b/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics-mem.ll
new file mode 100644
index 0000000000000..a1396bbc33ecd
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics-mem.ll
@@ -0,0 +1,141 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bmm,+avx512vl,+avx512bw --show-mc-encoding | FileCheck %s
+
+
+define <2 x i64> @test_mm128_vbitrevb_epi8_mem(ptr %ptr) {
+; CHECK-LABEL: test_mm128_vbitrevb_epi8_mem:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbitrevb (%rdi), %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x81,0x07]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <16 x i8>, ptr %ptr, align 16
+ %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0)
+ %2 = bitcast <16 x i8> %1 to <2 x i64>
+ ret <2 x i64> %2
+}
+
+define <2 x i64> @test_mm128_mask_vbitrevb_epi8_mem(<2 x i64> %src, i16 %mask, ptr %ptr) {
+; CHECK-LABEL: test_mm128_mask_vbitrevb_epi8_mem:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vbitrevb (%rsi), %xmm0 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x81,0x06]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <16 x i8>, ptr %ptr, align 16
+ %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0)
+ %2 = bitcast <2 x i64> %src to <16 x i8>
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+ %5 = bitcast <16 x i8> %4 to <2 x i64>
+ ret <2 x i64> %5
+}
+
+define <2 x i64> @test_mm128_maskz_vbitrevb_epi8_mem(i16 %mask, ptr %ptr) {
+; CHECK-LABEL: test_mm128_maskz_vbitrevb_epi8_mem:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vbitrevb (%rsi), %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0x89,0x81,0x06]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <16 x i8>, ptr %ptr, align 16
+ %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
+ %4 = bitcast <16 x i8> %3 to <2 x i64>
+ ret <2 x i64> %4
+}
+
+define <4 x i64> @test_mm256_vbitrevb_epi8_mem(ptr %ptr) {
+; CHECK-LABEL: test_mm256_vbitrevb_epi8_mem:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbitrevb (%rdi), %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x81,0x07]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <32 x i8>, ptr %ptr, align 32
+ %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0)
+ %2 = bitcast <32 x i8> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @test_mm256_mask_vbitrevb_epi8_mem(<4 x i64> %src, i32 %mask, ptr %ptr) {
+; CHECK-LABEL: test_mm256_mask_vbitrevb_epi8_mem:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vbitrevb (%rsi), %ymm0 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x81,0x06]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <32 x i8>, ptr %ptr, align 32
+ %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0)
+ %2 = bitcast <4 x i64> %src to <32 x i8>
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x i8> %1, <32 x i8> %2
+ %5 = bitcast <32 x i8> %4 to <4 x i64>
+ ret <4 x i64> %5
+}
+
+define <4 x i64> @test_mm256_maskz_vbitrevb_epi8_mem(i32 %mask, ptr %ptr) {
+; CHECK-LABEL: test_mm256_maskz_vbitrevb_epi8_mem:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vbitrevb (%rsi), %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xa9,0x81,0x06]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <32 x i8>, ptr %ptr, align 32
+ %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
+ %4 = bitcast <32 x i8> %3 to <4 x i64>
+ ret <4 x i64> %4
+}
+
+define <8 x i64> @test_mm512_vbitrevb_epi8_mem(ptr %ptr) {
+; CHECK-LABEL: test_mm512_vbitrevb_epi8_mem:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbitrevb (%rdi), %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x81,0x07]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <64 x i8>, ptr %ptr, align 64
+ %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0)
+ %2 = bitcast <64 x i8> %1 to <8 x i64>
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_mask_vbitrevb_epi8_mem(<8 x i64> %src, i64 %mask, ptr %ptr) {
+; CHECK-LABEL: test_mm512_mask_vbitrevb_epi8_mem:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; CHECK-NEXT: vbitrevb (%rsi), %zmm0 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0x81,0x06]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <64 x i8>, ptr %ptr, align 64
+ %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0)
+ %2 = bitcast <8 x i64> %src to <64 x i8>
+ %3 = bitcast i64 %mask to <64 x i1>
+ %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> %2
+ %5 = bitcast <64 x i8> %4 to <8 x i64>
+ ret <8 x i64> %5
+}
+
+define <8 x i64> @test_mm512_maskz_vbitrevb_epi8_mem(i64 %mask, ptr %ptr) {
+; CHECK-LABEL: test_mm512_maskz_vbitrevb_epi8_mem:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; CHECK-NEXT: vbitrevb (%rsi), %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xc9,0x81,0x06]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = load <64 x i8>, ptr %ptr, align 64
+ %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0)
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+ %4 = bitcast <64 x i8> %3 to <8 x i64>
+ ret <8 x i64> %4
+}
+
+declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>)
+
+declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
+
+declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>)
+
+
+
diff --git a/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics.ll
new file mode 100644
index 0000000000000..ad1f000792cc1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bmm,+avx512vl --show-mc-encoding | FileCheck %s
+
+define <2 x i64> @test_mm128_vbitrev_epi8(<2 x i64> %a) {
+; CHECK-LABEL: test_mm128_vbitrev_epi8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbitrevb %xmm0, %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x81,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = bitcast <2 x i64> %a to <16 x i8>
+ %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0)
+ %2 = bitcast <16 x i8> %1 to <2 x i64>
+ ret <2 x i64> %2
+}
+
+define <4 x i64> @test_mm256_vbitrev_epi8(<4 x i64> %a) {
+; CHECK-LABEL: test_mm256_vbitrev_epi8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbitrevb %ymm0, %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x81,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = bitcast <4 x i64> %a to <32 x i8>
+ %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0)
+ %2 = bitcast <32 x i8> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <8 x i64> @test_mm512_vbitrev_epi8(<8 x i64> %a) {
+; CHECK-LABEL: test_mm512_vbitrev_epi8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbitrevb %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x81,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = bitcast <8 x i64> %a to <64 x i8>
+ %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0)
+ %2 = bitcast <64 x i8> %1 to <8 x i64>
+ ret <8 x i64> %2
+}
+
+define <4 x float> @test_mm128_mask_vbitrevb_epi8(<2 x i64> %a, i64 %mask, <2 x i64> %b) {
+; CHECK-LABEL: test_mm128_mask_vbitrevb_epi8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vbitrevb %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x81,0xc1]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %conv = trunc i64 %mask to i16
+ %0 = bitcast <2 x i64> %b to <16 x i8>
+ %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0)
+ %2 = bitcast <2 x i64> %a to <16 x i8>
+ %3 = bitcast i16 %conv to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+ %5 = bitcast <16 x i8> %4 to <4 x float>
+ ret <4 x float> %5
+}
+
+define <8 x float> @test_mm256_mask_vbitrevb_epi8(<4 x i64> %a, i64 %mask, <4 x i64> %b) {
+; CHECK-LABEL: test_mm256_mask_vbitrevb_epi8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vbitrevb %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x81,0xc1]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %conv = trunc i64 %mask to i32
+ %0 = bitcast <4 x i64> %b to <32 x i8>
+ %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0)
+ %2 = bitcast <4 x i64> %a to <32 x i8>
+ %3 = bitcast i32 %conv to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x i8> %1, <32 x i8> %2
+ %5 = bitcast <32 x i8> %4 to <8 x float>
+ ret <8 x float> %5
+}
+
+define <8 x i64> @test_mm512_mask_vbitrevb_epi8(<8 x i64> %a, i64 %mask, <8 x i64> %b) {
+; CHECK-LABEL: test_mm512_mask_vbitrevb_epi8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; CHECK-NEXT: vbitrevb %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0x81,0xc1]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = bitcast <8 x i64> %b to <64 x i8>
+ %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0)
+ %2 = bitcast <8 x i64> %a to <64 x i8>
+ %3 = bitcast i64 %mask to <64 x i1>
+ %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> %2
+ %5 = bitcast <64 x i8> %4 to <8 x i64>
+ ret <8 x i64> %5
+}
+
+define <4 x float> @test_mm128_maskz_vbitrevb_epi8(i64 %mask, <2 x i64> %b) {
+; CHECK-LABEL: test_mm128_maskz_vbitrevb_epi8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vbitrevb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0x89,0x81,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %conv = trunc i64 %mask to i16
+ %0 = bitcast <2 x i64> %b to <16 x i8>
+ %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0)
+ %2 = bitcast i16 %conv to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
+ %4 = bitcast <16 x i8> %3 to <4 x float>
+ ret <4 x float> %4
+}
+
+define <8 x float> @test_mm256_maskz_vbitrevb_epi8(i64 %mask, <4 x i64> %b) {
+; CHECK-LABEL: test_mm256_maskz_vbitrevb_epi8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vbitrevb %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xa9,0x81,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %conv = trunc i64 %mask to i32
+ %0 = bitcast <4 x i64> %b to <32 x i8>
+ %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0)
+ %2 = bitcast i32 %conv to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
+ %4 = bitcast <32 x i8> %3 to <8 x float>
+ ret <8 x float> %4
+}
+
+define <8 x i64> @test_mm512_maskz_vbitrevb_epi8(i64 %mask, <8 x i64> %b) {
+; CHECK-LABEL: test_mm512_maskz_vbitrevb_epi8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; CHECK-NEXT: vbitrevb %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xc9,0x81,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = bitcast <8 x i64> %b to <64 x i8>
+ %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0)
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+ %4 = bitcast <64 x i8> %3 to <8 x i64>
+ ret <8 x i64> %4
+}
+
+declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>)
+declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
+declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>)
diff --git a/llvm/test/CodeGen/X86/avx512bmm-vbmac-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bmm-vbmac-intrinsics.ll
new file mode 100644
index 0000000000000..231ef1a5a351d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512bmm-vbmac-intrinsics.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bmm,+avx512vl --show-mc-encoding | FileCheck %s
+
+define <4 x i64> @test_mm256_vbmacor(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c) {
+; CHECK-LABEL: test_mm256_vbmacor:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbmacor16x16x16 %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x74,0x28,0x80,0xc2]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = bitcast <4 x i64> %a to <16 x i16>
+ %1 = bitcast <4 x i64> %b to <16 x i16>
+ %2 = bitcast <4 x i64> %c to <16 x i16>
+ %3 = tail call <16 x i16> @llvm.x86.avx512.vbmacor.v16hi(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2)
+ %4 = bitcast <16 x i16> %3 to <4 x i64>
+ ret <4 x i64> %4
+}
+
+define <4 x i64> @test_mm256_vbmacxor(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c) {
+; CHECK-LABEL: test_mm256_vbmacxor:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbmacxor16x16x16 %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0xf4,0x28,0x80,0xc2]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = bitcast <4 x i64> %a to <16 x i16>
+ %1 = bitcast <4 x i64> %b to <16 x i16>
+ %2 = bitcast <4 x i64> %c to <16 x i16>
+ %3 = tail call <16 x i16> @llvm.x86.avx512.vbmacxor.v16hi(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2)
+ %4 = bitcast <16 x i16> %3 to <4 x i64>
+ ret <4 x i64> %4
+}
+
+define <8 x i64> @test_mm512_vbmacor(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
+; CHECK-LABEL: test_mm512_vbmacor:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbmacor16x16x16 %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x74,0x48,0x80,0xc2]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = bitcast <8 x i64> %a to <32 x i16>
+ %1 = bitcast <8 x i64> %b to <32 x i16>
+ %2 = bitcast <8 x i64> %c to <32 x i16>
+ %3 = tail call <32 x i16> @llvm.x86.avx512.vbmacor.v32hi(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
+ %4 = bitcast <32 x i16> %3 to <8 x i64>
+ ret <8 x i64> %4
+}
+
+define <8 x i64> @test_mm512_vbmacxor(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
+; CHECK-LABEL: test_mm512_vbmacxor:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vbmacxor16x16x16 %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0xf4,0x48,0x80,0xc2]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %0 = bitcast <8 x i64> %a to <32 x i16>
+ %1 = bitcast <8 x i64> %b to <32 x i16>
+ %2 = bitcast <8 x i64> %c to <32 x i16>
+ %3 = tail call <32 x i16> @llvm.x86.avx512.vbmacxor.v32hi(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
+ %4 = bitcast <32 x i16> %3 to <8 x i64>
+ ret <8 x i64> %4
+}
+
+declare <16 x i16> @llvm.x86.avx512.vbmacor.v16hi(<16 x i16>, <16 x i16>, <16 x i16>)
+declare <16 x i16> @llvm.x86.avx512.vbmacxor.v16hi(<16 x i16>, <16 x i16>, <16 x i16>)
+declare <32 x i16> @llvm.x86.avx512.vbmacor.v32hi(<32 x i16>, <32 x i16>, <32 x i16>)
+declare <32 x i16> @llvm.x86.avx512.vbmacxor.v32hi(<32 x i16>, <32 x i16>, <32 x i16>)
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index bafc98a69ddae..7b64162c9ff25 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -1633,6 +1633,9 @@ static const X86FoldTableEntry Table1[] = {
{X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0},
{X86::VPABSWZrr, X86::VPABSWZrm, 0},
{X86::VPABSWrr, X86::VPABSWrm, 0},
+ {X86::VPBITREVBZ128rr, X86::VPBITREVBZ128rm, 0},
+ {X86::VPBITREVBZ256rr, X86::VPBITREVBZ256rm, 0},
+ {X86::VPBITREVBZrr, X86::VPBITREVBZrm, 0},
{X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE},
{X86::VPBROADCASTBZ128rr, X86::VPBROADCASTBZ128rm, TB_NO_REVERSE},
{X86::VPBROADCASTBZ256rr, X86::VPBROADCASTBZ256rm, TB_NO_REVERSE},
@@ -3310,6 +3313,9 @@ static const X86FoldTableEntry Table2[] = {
{X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0},
{X86::VPAVGWZrr, X86::VPAVGWZrm, 0},
{X86::VPAVGWrr, X86::VPAVGWrm, 0},
+ {X86::VPBITREVBZ128rrkz, X86::VPBITREVBZ128rmkz, 0},
+ {X86::VPBITREVBZ256rrkz, X86::VPBITREVBZ256rmkz, 0},
+ {X86::VPBITREVBZrrkz, X86::VPBITREVBZrmkz, 0},
{X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0},
{X86::VPBLENDDrri, X86::VPBLENDDrmi, 0},
{X86::VPBLENDMBZ128rr, X86::VPBLENDMBZ128rm, 0},
@@ -4266,6 +4272,10 @@ static const X86FoldTableEntry Table3[] = {
{X86::VBLENDMPSZ128rrk, X86::VBLENDMPSZ128rmk, 0},
{X86::VBLENDMPSZ256rrk, X86::VBLENDMPSZ256rmk, 0},
{X86::VBLENDMPSZrrk, X86::VBLENDMPSZrmk, 0},
+ {X86::VBMACORZ256rr, X86::VBMACORZ256rm, 0},
+ {X86::VBMACORZrr, X86::VBMACORZrm, 0},
+ {X86::VBMACXORZ256rr, X86::VBMACXORZ256rm, 0},
+ {X86::VBMACXORZrr, X86::VBMACXORZrm, 0},
{X86::VBROADCASTF32X2Z256rrk, X86::VBROADCASTF32X2Z256rmk, TB_NO_REVERSE},
{X86::VBROADCASTF32X2Zrrk, X86::VBROADCASTF32X2Zrmk, TB_NO_REVERSE},
{X86::VBROADCASTI32X2Z128rrk, X86::VBROADCASTI32X2Z128rmk, TB_NO_REVERSE},
@@ -5284,6 +5294,9 @@ static const X86FoldTableEntry Table3[] = {
{X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0},
{X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0},
{X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0},
+ {X86::VPBITREVBZ128rrk, X86::VPBITREVBZ128rmk, 0},
+ {X86::VPBITREVBZ256rrk, X86::VPBITREVBZ256rmk, 0},
+ {X86::VPBITREVBZrrk, X86::VPBITREVBZrmk, 0},
{X86::VPBLENDMBZ128rrk, X86::VPBLENDMBZ128rmk, 0},
{X86::VPBLENDMBZ256rrk, X86::VPBLENDMBZ256rmk, 0},
{X86::VPBLENDMBZrrk, X86::VPBLENDMBZrmk, 0},
@@ -6110,6 +6123,14 @@ static const X86FoldTableEntry Table4[] = {
{X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0},
{X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0},
{X86::VANDPSZrrk, X86::VANDPSZrmk, 0},
+ {X86::VBMACORZ256rrk, X86::VBMACORZ256rmk, 0},
+ {X86::VBMACORZ256rrkz, X86::VBMACORZ256rmkz, 0},
+ {X86::VBMACORZrrk, X86::VBMACORZrmk, 0},
+ {X86::VBMACORZrrkz, X86::VBMACORZrmkz, 0},
+ {X86::VBMACXORZ256rrk, X86::VBMACXORZ256rmk, 0},
+ {X86::VBMACXORZ256rrkz, X86::VBMACXORZ256rmkz, 0},
+ {X86::VBMACXORZrrk, X86::VBMACXORZrmk, 0},
+ {X86::VBMACXORZrrkz, X86::VBMACXORZrmkz, 0},
{X86::VCVT2PH2BF8SZ128rrk, X86::VCVT2PH2BF8SZ128rmk, 0},
{X86::VCVT2PH2BF8SZ256rrk, X86::VCVT2PH2BF8SZ256rmk, 0},
{X86::VCVT2PH2BF8SZrrk, X86::VCVT2PH2BF8SZrmk, 0},
@@ -8674,6 +8695,10 @@ static const X86FoldTableEntry BroadcastTable3[] = {
{X86::VBLENDMPSZ128rrk, X86::VBLENDMPSZ128rmbk, TB_BCAST_SS},
{X86::VBLENDMPSZ256rrk, X86::VBLENDMPSZ256rmbk, TB_BCAST_SS},
{X86::VBLENDMPSZrrk, X86::VBLENDMPSZrmbk, TB_BCAST_SS},
+ {X86::VBMACORZ256rr, X86::VBMACORZ256rmb, TB_BCAST_W},
+ {X86::VBMACORZrr, X86::VBMACORZrmb, TB_BCAST_W},
+ {X86::VBMACXORZ256rr, X86::VBMACXORZ256rmb, TB_BCAST_W},
+ {X86::VBMACXORZrr, X86::VBMACXORZrmb, TB_BCAST_W},
{X86::VCMPBF16Z128rrik, X86::VCMPBF16Z128rmbik, TB_BCAST_SH},
{X86::VCMPBF16Z256rrik, X86::VCMPBF16Z256rmbik, TB_BCAST_SH},
{X86::VCMPBF16Zrrik, X86::VCMPBF16Zrmbik, TB_BCAST_SH},
@@ -9786,6 +9811,14 @@ static const X86FoldTableEntry BroadcastTable4[] = {
{X86::VANDPSZ128rrk, X86::VANDPSZ128rmbk, TB_BCAST_SS},
{X86::VANDPSZ256rrk, X86::VANDPSZ256rmbk, TB_BCAST_SS},
{X86::VANDPSZrrk, X86::VANDPSZrmbk, TB_BCAST_SS},
+ {X86::VBMACORZ256rrk, X86::VBMACORZ256rmbk, TB_BCAST_W},
+ {X86::VBMACORZ256rrkz, X86::VBMACORZ256rmbkz, TB_BCAST_W},
+ {X86::VBMACORZrrk, X86::VBMACORZrmbk, TB_BCAST_W},
+ {X86::VBMACORZrrkz, X86::VBMACORZrmbkz, TB_BCAST_W},
+ {X86::VBMACXORZ256rrk, X86::VBMACXORZ256rmbk, TB_BCAST_W},
+ {X86::VBMACXORZ256rrkz, X86::VBMACXORZ256rmbkz, TB_BCAST_W},
+ {X86::VBMACXORZrrk, X86::VBMACXORZrmbk, TB_BCAST_W},
+ {X86::VBMACXORZrrkz, X86::VBMACXORZrmbkz, TB_BCAST_W},
{X86::VCVT2PH2BF8SZ128rrk, X86::VCVT2PH2BF8SZ128rmbk, TB_BCAST_SH},
{X86::VCVT2PH2BF8SZ256rrk, X86::VCVT2PH2BF8SZ256rmbk, TB_BCAST_SH},
{X86::VCVT2PH2BF8SZrrk, X86::VCVT2PH2BF8SZrmbk, TB_BCAST_SH},
diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index 3087744f694c7..c65bec65b6c66 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -162,6 +162,8 @@ copy("Headers") {
"avx512bf16intrin.h",
"avx512bitalgintrin.h",
"avx512bwintrin.h",
+ "avx512bmmintrin.h",
+ "avx512bmmvlintrin.h",
"avx512cdintrin.h",
"avx512dqintrin.h",
"avx512fintrin.h",
More information about the llvm-commits
mailing list