[clang] [llvm] [mlir] [x86][AVX-VNNI] Fix VPDPBXXD Argument Type (PR #159222)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 16 23:02:41 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir
@llvm/pr-subscribers-backend-x86
Author: BaiXilin (BaiXilin)
<details>
<summary>Changes</summary>
Fixed intrinsic VPDP[SS,SU,UU]D[,S]_128/256/512's argument types to match with the ISA.
Fixes part of #<!-- -->97271.
---
Patch is 278.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159222.diff
17 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsX86.td (+18-18)
- (modified) clang/lib/Headers/avx10_2_512niintrin.h (+12-12)
- (modified) clang/lib/Headers/avxvnniint8intrin.h (+56-32)
- (modified) clang/test/CodeGen/X86/avxvnniint8-builtins.c (+12-12)
- (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+18-18)
- (modified) llvm/lib/IR/AutoUpgrade.cpp (+63-1)
- (modified) llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp (+13-1)
- (added) llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll (+291)
- (modified) llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll (+27-27)
- (modified) llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll (+36-36)
- (modified) llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll (+60-60)
- (modified) llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll (+60-60)
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll (+178-118)
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll (+216-136)
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll (+408-248)
- (modified) mlir/include/mlir/Dialect/X86Vector/X86Vector.td (-5)
- (modified) mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp (-23)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index aac502091b57e..3be3a5e375ae8 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1106,51 +1106,51 @@ let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<5
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpdpbssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpbssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, char>)">;
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpdpbssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpbssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, char>)">;
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpdpbssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpbssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, char>)">;
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpdpbssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpbssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, char>)">;
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpdpbsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpbsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, unsigned char>)">;
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpdpbsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpbsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, unsigned char>)">;
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpdpbsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpbsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, unsigned char>)">;
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpdpbsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpbsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, unsigned char>)">;
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpdpbuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpbuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, unsigned char>)">;
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpdpbuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpbuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, _Vector<32, unsigned char>)">;
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpdpbuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpbuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, unsigned char>)">;
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpdpbuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpbuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, _Vector<32, unsigned char>)">;
}
let Features = "movrs", Attributes = [NoThrow, Const] in {
@@ -4276,12 +4276,12 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>
let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def vdpphps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, _Float16>, _Vector<32, _Float16>)">;
- def vpdpbssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
- def vpdpbssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
- def vpdpbsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
- def vpdpbsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
- def vpdpbuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
- def vpdpbuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+ def vpdpbssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, char>)">;
+ def vpdpbssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, char>)">;
+ def vpdpbsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, unsigned char>)">;
+ def vpdpbsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, unsigned char>)">;
+ def vpdpbuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, unsigned char>)">;
+ def vpdpbuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, unsigned char>)">;
}
let Features = "avx10.2", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h
index 67679fce82296..fdb57c7c9e27b 100644
--- a/clang/lib/Headers/avx10_2_512niintrin.h
+++ b/clang/lib/Headers/avx10_2_512niintrin.h
@@ -64,8 +64,8 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_dpph_ps(__mmask16 __U,
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssd_epi32(__m512i __W,
__m512i __A,
__m512i __B) {
- return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v16si)__A,
- (__v16si)__B);
+ return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v64qi)__A,
+ (__v64qi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -84,8 +84,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssd_epi32(
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssds_epi32(__m512i __W,
__m512i __A,
__m512i __B) {
- return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v16si)__A,
- (__v16si)__B);
+ return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v64qi)__A,
+ (__v64qi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbssds_epi32(
@@ -104,8 +104,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssds_epi32(
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsud_epi32(__m512i __W,
__m512i __A,
__m512i __B) {
- return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v16si)__A,
- (__v16si)__B);
+ return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v64qi)__A,
+ (__v64qu)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -124,8 +124,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsud_epi32(
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsuds_epi32(__m512i __W,
__m512i __A,
__m512i __B) {
- return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v16si)__A,
- (__v16si)__B);
+ return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v64qi)__A,
+ (__v64qu)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbsuds_epi32(
@@ -144,8 +144,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsuds_epi32(
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuud_epi32(__m512i __W,
__m512i __A,
__m512i __B) {
- return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v16si)__A,
- (__v16si)__B);
+ return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v64qu)__A,
+ (__v64qu)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -164,8 +164,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuud_epi32(
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuuds_epi32(__m512i __W,
__m512i __A,
__m512i __B) {
- return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v16si)__A,
- (__v16si)__B);
+ return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v64qu)__A,
+ (__v64qu)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbuuds_epi32(
diff --git a/clang/lib/Headers/avxvnniint8intrin.h b/clang/lib/Headers/avxvnniint8intrin.h
index c211620c68f07..858b66b138f31 100644
--- a/clang/lib/Headers/avxvnniint8intrin.h
+++ b/clang/lib/Headers/avxvnniint8intrin.h
@@ -14,6 +14,7 @@
#ifndef __AVXVNNIINT8INTRIN_H
#define __AVXVNNIINT8INTRIN_H
+// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
@@ -44,10 +45,12 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
+// clang-format on
#define _mm_dpbssd_epi32(__W, __A, __B) \
- ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A), \
- (__v4si)(__B)))
+ ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v16qi)(__A), \
+ (__v16qi)(__B)))
+// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
@@ -78,10 +81,12 @@
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
+// clang-format on
#define _mm256_dpbssd_epi32(__W, __A, __B) \
- ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A), \
- (__v8si)(__B)))
+ ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v32qi)(__A), \
+ (__v32qi)(__B)))
+// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
@@ -94,7 +99,7 @@
/// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __A
/// A 128-bit vector of [16 x char].
@@ -113,10 +118,12 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
+// clang-format on
#define _mm_dpbssds_epi32(__W, __A, __B) \
- ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A), \
- (__v4si)(__B)))
+ ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v16qi)(__A), \
+ (__v16qi)(__B)))
+// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
@@ -129,7 +136,7 @@
/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __A
/// A 256-bit vector of [32 x char].
@@ -148,10 +155,12 @@
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
+// clang-format on
#define _mm256_dpbssds_epi32(__W, __A, __B) \
- ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A), \
- (__v8si)(__B)))
+ ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v32qi)(__A), \
+ (__v32qi)(__B)))
+// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
@@ -163,7 +172,7 @@
/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __A
/// A 128-bit vector of [16 x char].
@@ -182,10 +191,12 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
+// clang-format on
#define _mm_dpbsud_epi32(__W, __A, __B) \
- ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A), \
- (__v4si)(__B)))
+ ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v16qi)(__A), \
+ (__v16qu)(__B)))
+// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
@@ -197,7 +208,7 @@
/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __A
/// A 256-bit vector of [32 x char].
@@ -216,10 +227,12 @@
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
+// clang-format on
#define _mm256_dpbsud_epi32(__W, __A, __B) \
- ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A), \
- (__v8si)(__B)))
+ ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v32qi)(__A), \
+ (__v32qu)(__B)))
+// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
@@ -232,7 +245,7 @@
/// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __A
/// A 128-bit vector of [16 x char].
@@ -251,10 +264,12 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
+// clang-format on
#define _mm_dpbsuds_epi32(__W, __A, __B) \
- ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A), \
- (__v4si)(__B)))
+ ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v16qi)(__A), \
+ (__v16qu)(__B)))
+// clang-format off
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
@@ -267,7 +282,7 @@
/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __A
/// A 256-bit vector of [32 x char].
@@ -286,10 +301,12 @@
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
+// clang-format on
#define _mm256_dpbsuds_epi32(__W, __A, __B) \
- ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A), \
- (__v8si)(__B)))
+ ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v32qi)(__A), \
+ (__v32qu)(__B)))
+// clang-format off
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
@@ -301,7 +318,7 @@
/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __A
/// A 128-bit vector of [16 x unsigned char].
@@ -320,10 +337,12 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
+// clang-format on
#define _mm_dpbuud_epi32(__W, __A, __B) \
- ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A), \
- (__v4si)(__B)))
+ ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v16qu)(__A), \
+ (__v16qu)(__B)))
+// clang-format off
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
@@ -335,7 +354,7 @@
/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __A
/// A 256-bit vector of [32 x unsigned char].
@@ -354,10 +373,12 @@
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
+// clang-format on
#define _mm256_dpbuud_epi32(__W, __A, __B) \
- ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A), \
- (__v8si)(__B)))
+ ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v32qu)(__A), \
+ (__v32qu)(__B)))
+// clang-format off
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
@@ -389,10 +410,12 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
+// clang-format on
#define _mm_dpbuuds_epi32(__W, __A, __B) ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/159222
More information about the llvm-commits
mailing list