[clang] [llvm] [x86][AVX-VNNI] Fix VPDPWXXD Argument Types (PR #169456)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 1 07:18:55 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: BaiXilin (BaiXilin)
<details>
<summary>Changes</summary>
Fixed the argument types of the following intrinsics to match the ISA:
- vpdpwssd_128, vpdpwssd_256, vpdpwssd_512
- vpdpwssds_128, vpdpwssds_256, vpdpwssds_512
- vpdpwsud_128, vpdpwsud_256, vpdpwsud_512
- vpdpwsuds_128, vpdpwsuds_256, vpdpwsuds_512
- vpdpwusd_128, vpdpwusd_256, vpdpwusd_512
- vpdpwusds_128, vpdpwusds_256, vpdpwusds_512
- vpdpwuud_128, vpdpwuud_256, vpdpwuud_512
- vpdpwuuds_128, vpdpwuuds_256, vpdpwuuds_512
Fixes part of #<!-- -->97271. Note that this is the last PR for the issue.
---
Patch is 360.35 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169456.diff
35 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsX86.td (+24-24)
- (modified) clang/lib/Headers/avx10_2_512niintrin.h (+12-12)
- (modified) clang/lib/Headers/avx512vlvnniintrin.h (+9-8)
- (modified) clang/lib/Headers/avx512vnniintrin.h (+4-4)
- (modified) clang/lib/Headers/avxvnniint16intrin.h (+61-36)
- (modified) clang/lib/Headers/avxvnniintrin.h (+8-4)
- (modified) clang/test/CodeGen/X86/avx10_2_512ni-builtins.c (+18-18)
- (modified) clang/test/CodeGen/X86/avx10_2ni-builtins.c (+24-24)
- (modified) clang/test/CodeGen/X86/avx512vlvnni-builtins.c (+12-12)
- (modified) clang/test/CodeGen/X86/avx512vnni-builtins.c (+6-6)
- (modified) clang/test/CodeGen/X86/avxvnni-builtins.c (+8-8)
- (modified) clang/test/CodeGen/X86/avxvnniint16-builtins.c (+12-12)
- (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+30-30)
- (modified) llvm/lib/IR/AutoUpgrade.cpp (+153-30)
- (modified) llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp (+84-18)
- (modified) llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll (+96)
- (modified) llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll (+9-9)
- (modified) llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll (+12-12)
- (modified) llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll (+29-29)
- (modified) llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll (+32-10)
- (modified) llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll (+14-14)
- (modified) llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll (+44)
- (modified) llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll (+12-12)
- (added) llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll (+185)
- (modified) llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll (+36-36)
- (modified) llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll (+20-20)
- (modified) llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll (+12-12)
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll (+199-91)
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll (+252-108)
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll (+36-36)
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll (+36-36)
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll (+18-18)
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll (+18-18)
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll (+12-12)
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll (+260-116)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index cb08e2107f072..da34d4123628f 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1132,27 +1132,27 @@ let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<5
}
let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
}
let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
}
let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+ def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
}
let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
}
let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
}
let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+ def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
@@ -4325,12 +4325,12 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
}
let Features = "avx10.2", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
- def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
- def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
- def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
- def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
- def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
- def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+ def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">;
+ def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">;
+ def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">;
+ def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">;
+ def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">;
+ def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">;
}
let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
@@ -4338,51 +4338,51 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
}
let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
- def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">;
}
let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
- def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">;
}
let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
- def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">;
}
let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
- def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">;
}
let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
- def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">;
}
let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
- def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">;
}
let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
- def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">;
}
let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
- def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">;
}
let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
- def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
}
let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
- def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
}
let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
- def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
}
let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
- def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
}
let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h
index fdb57c7c9e27b..b2215b72c57bc 100644
--- a/clang/lib/Headers/avx10_2_512niintrin.h
+++ b/clang/lib/Headers/avx10_2_512niintrin.h
@@ -185,8 +185,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32(
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A,
__m512i __B,
__m512i __C) {
- return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B,
- (__v16si)__C);
+ return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v32hi)__B,
+ (__v32hu)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -206,8 +206,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32(
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A,
__m512i __B,
__m512i __C) {
- return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B,
- (__v16si)__C);
+ return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v32hi)__B,
+ (__v32hu)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32(
@@ -227,8 +227,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32(
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A,
__m512i __B,
__m512i __C) {
- return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B,
- (__v16si)__C);
+ return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v32hu)__B,
+ (__v32hi)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -248,8 +248,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32(
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A,
__m512i __B,
__m512i __C) {
- return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B,
- (__v16si)__C);
+ return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v32hu)__B,
+ (__v32hi)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32(
@@ -269,8 +269,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32(
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A,
__m512i __B,
__m512i __C) {
- return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B,
- (__v16si)__C);
+ return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v32hu)__B,
+ (__v32hu)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -290,8 +290,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32(
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A,
__m512i __B,
__m512i __C) {
- return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B,
- (__v16si)__C);
+ return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v32hu)__B,
+ (__v32hu)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32(
diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h
index a1a0338a69e0d..4b8a199af32e5 100644
--- a/clang/lib/Headers/avx512vlvnniintrin.h
+++ b/clang/lib/Headers/avx512vlvnniintrin.h
@@ -80,8 +80,8 @@
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
-#define _mm256_dpwssd_epi32(S, A, B) \
- ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+#define _mm256_dpwssd_epi32(S, A, B) \
+ ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v16hi)(A), (__v16hi)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@@ -98,8 +98,9 @@
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
-#define _mm256_dpwssds_epi32(S, A, B) \
- ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+#define _mm256_dpwssds_epi32(S, A, B) \
+ ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v16hi)(A), \
+ (__v16hi)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@@ -157,8 +158,8 @@
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
-#define _mm_dpwssd_epi32(S, A, B) \
- ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+#define _mm_dpwssd_epi32(S, A, B) \
+ ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v8hi)(A), (__v8hi)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@@ -175,8 +176,8 @@
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
-#define _mm_dpwssds_epi32(S, A, B) \
- ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+#define _mm_dpwssds_epi32(S, A, B) \
+ ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v8hi)(A), (__v8hi)(B)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h
index c386923360de6..2ce88efe4a04f 100644
--- a/clang/lib/Headers/avx512vnniintrin.h
+++ b/clang/lib/Headers/avx512vnniintrin.h
@@ -68,8 +68,8 @@ _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
- (__v16si)__B);
+ return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v32hi)__A,
+ (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -91,8 +91,8 @@ _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
- (__v16si)__B);
+ return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v32hi)__A,
+ (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h
index 805d249911c17..801795ac543dd 100644
--- a/clang/lib/Headers/avxvnniint16intrin.h
+++ b/clang/lib/Headers/avxvnniint16intrin.h
@@ -15,6 +15,7 @@
#ifndef __AVXVNNIINT16INTRIN_H
#define __AVXVNNIINT16INTRIN_H
+// clang-format off
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 16-bit results. Sum these 2 results with the corresponding
@@ -45,10 +46,12 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
+// clang-format on
#define _mm_dpwsud_epi32(__W, __A, __B) \
- ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \
- (__v4si)(__B)))
+ ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v8hi)(__A), \
+ (__v8hu)(__B)))
+// clang-format off
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 16-bit results. Sum these 2 results with the corresponding
@@ -79,10 +82,12 @@
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
+// clang-format on
#define _mm256_dpwsud_epi32(__W, __A, __B) \
- ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \
- (__v8si)(__B)))
+ ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v16hi)(__A), \
+ (__v16hu)(__B)))
+// clang-format off
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 16-bit results. Sum these 2 results with the corresponding
@@ -114,10 +119,13 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
+// clang-format on
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
#define _mm_dpwsuds_epi32(__W, __A, __B) \
- ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \
- (__v4si)(__B)))
+ ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v8hi)(__A), \
+ (__v8hu)(__B)))
+// clang-format off
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 16-bit results. Sum these 2 results with the corresponding
@@ -149,10 +157,12 @@
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
+// clang-format on
#define _mm256_dpwsuds_epi32(__W, __A, __B) \
- ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \
- (__v8si)(__B)))
+ ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v16hi)(__A), \
+ (__v16hu)(__B)))
+// clang-format off
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 16-bit results. Sum these 2 results with the corresponding
@@ -183,10 +193,12 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
+// clang-format on
#define _mm_dpwusd_epi32(__W, __A, __B) \
- ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \
- (__v4si)(__B)))
+ ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v8hu)(__A), \
+ (__v8hi)(__B)))
+// clang-format off
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 16-bit results. Sum these 2 results with the corresponding
@@ -217,10 +229,12 @@
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
+// clang-format on
#define _mm256_dpwusd_epi32(__W, __A, __B) \
- ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \
- (_...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/169456
More information about the llvm-commits
mailing list