[clang] [llvm] [x86][AVX-VNNI] Fix VPDPWXXD Argument Types (PR #169456)

Mon Dec 1 07:18:55 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-x86

Author: BaiXilin (BaiXilin)

<details>
<summary>Changes</summary>

Fixed the argument types of the following intrinsics to match the ISA:
 - vpdpwssd_128, vpdpwssd_256, vpdpwssd_512
 - vpdpwssds_128, vpdpwssds_256, vpdpwssds_512
 - vpdpwsud_128, vpdpwsud_256, vpdpwsud_512
 - vpdpwsuds_128, vpdpwsuds_256, vpdpwsuds_512
 - vpdpwusd_128, vpdpwusd_256, vpdpwusd_512
 - vpdpwusds_128, vpdpwusds_256, vpdpwusds_512
 - vpdpwuud_128, vpdpwuud_256, vpdpwuud_512
 - vpdpwuuds_128, vpdpwuuds_256, vpdpwuuds_512

Fixes part of #97271. Note that this is the last PR for the issue.

---

Patch is 360.35 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169456.diff


35 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsX86.td (+24-24) 
- (modified) clang/lib/Headers/avx10_2_512niintrin.h (+12-12) 
- (modified) clang/lib/Headers/avx512vlvnniintrin.h (+9-8) 
- (modified) clang/lib/Headers/avx512vnniintrin.h (+4-4) 
- (modified) clang/lib/Headers/avxvnniint16intrin.h (+61-36) 
- (modified) clang/lib/Headers/avxvnniintrin.h (+8-4) 
- (modified) clang/test/CodeGen/X86/avx10_2_512ni-builtins.c (+18-18) 
- (modified) clang/test/CodeGen/X86/avx10_2ni-builtins.c (+24-24) 
- (modified) clang/test/CodeGen/X86/avx512vlvnni-builtins.c (+12-12) 
- (modified) clang/test/CodeGen/X86/avx512vnni-builtins.c (+6-6) 
- (modified) clang/test/CodeGen/X86/avxvnni-builtins.c (+8-8) 
- (modified) clang/test/CodeGen/X86/avxvnniint16-builtins.c (+12-12) 
- (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+30-30) 
- (modified) llvm/lib/IR/AutoUpgrade.cpp (+153-30) 
- (modified) llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp (+84-18) 
- (modified) llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll (+96) 
- (modified) llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll (+9-9) 
- (modified) llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll (+12-12) 
- (modified) llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll (+29-29) 
- (modified) llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll (+32-10) 
- (modified) llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll (+14-14) 
- (modified) llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll (+44) 
- (modified) llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll (+12-12) 
- (added) llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll (+185) 
- (modified) llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll (+36-36) 
- (modified) llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll (+20-20) 
- (modified) llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll (+12-12) 
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll (+199-91) 
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll (+252-108) 
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll (+36-36) 
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll (+36-36) 
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll (+18-18) 
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll (+18-18) 
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll (+12-12) 
- (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll (+260-116) 


``````````diff

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index cb08e2107f072..da34d4123628f 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1132,27 +1132,27 @@ let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<5
 }
 
 let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
 }
 
 let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
 }
 
 let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
 }
 
 let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
 }
 
 let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
 }
 
 let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
 }
 
 let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
@@ -4325,12 +4325,12 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
 }
 
 let Features = "avx10.2", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
-  def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">;
+  def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">;
+  def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">;
+  def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">;
+  def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">;
+  def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">;
 }
 
 let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
@@ -4338,51 +4338,51 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
 }
 
 let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h
index fdb57c7c9e27b..b2215b72c57bc 100644
--- a/clang/lib/Headers/avx10_2_512niintrin.h
+++ b/clang/lib/Headers/avx10_2_512niintrin.h
@@ -185,8 +185,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32(
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A,
                                                                  __m512i __B,
                                                                  __m512i __C) {
-  return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B,
-                                             (__v16si)__C);
+  return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v32hi)__B,
+                                             (__v32hu)__C);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -206,8 +206,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32(
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A,
                                                                   __m512i __B,
                                                                   __m512i __C) {
-  return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B,
-                                              (__v16si)__C);
+  return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v32hi)__B,
+                                              (__v32hu)__C);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32(
@@ -227,8 +227,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32(
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A,
                                                                  __m512i __B,
                                                                  __m512i __C) {
-  return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B,
-                                             (__v16si)__C);
+  return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v32hu)__B,
+                                             (__v32hi)__C);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -248,8 +248,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32(
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A,
                                                                   __m512i __B,
                                                                   __m512i __C) {
-  return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B,
-                                              (__v16si)__C);
+  return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v32hu)__B,
+                                              (__v32hi)__C);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32(
@@ -269,8 +269,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32(
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A,
                                                                  __m512i __B,
                                                                  __m512i __C) {
-  return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B,
-                                             (__v16si)__C);
+  return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v32hu)__B,
+                                             (__v32hu)__C);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -290,8 +290,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32(
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A,
                                                                   __m512i __B,
                                                                   __m512i __C) {
-  return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B,
-                                              (__v16si)__C);
+  return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v32hu)__B,
+                                              (__v32hu)__C);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32(
diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h
index a1a0338a69e0d..4b8a199af32e5 100644
--- a/clang/lib/Headers/avx512vlvnniintrin.h
+++ b/clang/lib/Headers/avx512vlvnniintrin.h
@@ -80,8 +80,8 @@
 ///    ENDFOR
 ///    DST[MAX:256] := 0
 /// \endcode
-#define _mm256_dpwssd_epi32(S, A, B) \
-  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+#define _mm256_dpwssd_epi32(S, A, B)                                           \
+  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v16hi)(A), (__v16hi)(B)))
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@@ -98,8 +98,9 @@
 ///    ENDFOR
 ///    DST[MAX:256] := 0
 /// \endcode
-#define _mm256_dpwssds_epi32(S, A, B) \
-  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+#define _mm256_dpwssds_epi32(S, A, B)                                          \
+  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v16hi)(A),             \
+                                        (__v16hi)(B)))
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@@ -157,8 +158,8 @@
 ///    ENDFOR
 ///    DST[MAX:128] := 0
 /// \endcode
-#define _mm_dpwssd_epi32(S, A, B) \
-  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+#define _mm_dpwssd_epi32(S, A, B)                                              \
+  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v8hi)(A), (__v8hi)(B)))
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@@ -175,8 +176,8 @@
 ///    ENDFOR
 ///    DST[MAX:128] := 0
 /// \endcode
-#define _mm_dpwssds_epi32(S, A, B) \
-  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+#define _mm_dpwssds_epi32(S, A, B)                                             \
+  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v8hi)(A), (__v8hi)(B)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h
index c386923360de6..2ce88efe4a04f 100644
--- a/clang/lib/Headers/avx512vnniintrin.h
+++ b/clang/lib/Headers/avx512vnniintrin.h
@@ -68,8 +68,8 @@ _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
-                                             (__v16si)__B);
+  return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v32hi)__A,
+                                             (__v32hi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -91,8 +91,8 @@ _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
-                                              (__v16si)__B);
+  return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v32hi)__A,
+                                              (__v32hi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h
index 805d249911c17..801795ac543dd 100644
--- a/clang/lib/Headers/avxvnniint16intrin.h
+++ b/clang/lib/Headers/avxvnniint16intrin.h
@@ -15,6 +15,7 @@
 #ifndef __AVXVNNIINT16INTRIN_H
 #define __AVXVNNIINT16INTRIN_H
 
+// clang-format off
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 ///    signed 16-bit results. Sum these 2 results with the corresponding
@@ -45,10 +46,12 @@
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
+// clang-format on
 #define _mm_dpwsud_epi32(__W, __A, __B)                                        \
-  ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A),           \
-                                       (__v4si)(__B)))
+  ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v8hi)(__A),           \
+                                       (__v8hu)(__B)))
 
+// clang-format off
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 ///    signed 16-bit results. Sum these 2 results with the corresponding
@@ -79,10 +82,12 @@
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
+// clang-format on
 #define _mm256_dpwsud_epi32(__W, __A, __B)                                     \
-  ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A),           \
-                                       (__v8si)(__B)))
+  ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v16hi)(__A),          \
+                                       (__v16hu)(__B)))
 
+// clang-format off
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 ///    signed 16-bit results. Sum these 2 results with the corresponding
@@ -114,10 +119,13 @@
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
+// clang-format on
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 #define _mm_dpwsuds_epi32(__W, __A, __B)                                       \
-  ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A),          \
-                                        (__v4si)(__B)))
+  ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v8hi)(__A),          \
+                                        (__v8hu)(__B)))
 
+// clang-format off
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 ///    signed 16-bit results. Sum these 2 results with the corresponding
@@ -149,10 +157,12 @@
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
+// clang-format on
 #define _mm256_dpwsuds_epi32(__W, __A, __B)                                    \
-  ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A),          \
-                                        (__v8si)(__B)))
+  ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v16hi)(__A),         \
+                                        (__v16hu)(__B)))
 
+// clang-format off
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
 ///    signed 16-bit results. Sum these 2 results with the corresponding
@@ -183,10 +193,12 @@
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
+// clang-format on
 #define _mm_dpwusd_epi32(__W, __A, __B)                                        \
-  ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A),           \
-                                       (__v4si)(__B)))
+  ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v8hu)(__A),           \
+                                       (__v8hi)(__B)))
 
+// clang-format off
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
 ///    signed 16-bit results. Sum these 2 results with the corresponding
@@ -217,10 +229,12 @@
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
+// clang-format on
 #define _mm256_dpwusd_epi32(__W, __A, __B)                                     \
-  ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A),           \
-                                       (_...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/169456