[clang] [llvm] [x86][AVX-VNNI] Fix VPDPWXXD Argument Types (PR #169456)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 24 21:30:41 PST 2025
https://github.com/BaiXilin created https://github.com/llvm/llvm-project/pull/169456
Fixed the argument types of the following intrinsics to match with the ISA:
- vpdpwssd_128, vpdpwssd_256, vpdpwssd_512,
- vpdpwssds_128, vpdpwssds_256, vpdpwssds_512
- more to come
>From 3735e20c2c20a426ae14c196f08468c41a347694 Mon Sep 17 00:00:00 2001
From: BaiXilin <x53bai at uwaterloo.ca>
Date: Tue, 25 Nov 2025 00:18:36 -0500
Subject: [PATCH] [x86][AVX-VNNI] Fix VPDPWXXD Argument Types
Fixed the argument types of the following intrinsics to match with the
ISA:
- vpdpwssd_128, vpdpwssd_256, vpdpwssd_512,
- vpdpwssds_128, vpdpwssds_256, vpdpwssds_512
- more to come
---
clang/include/clang/Basic/BuiltinsX86.td | 12 ++--
clang/lib/Headers/avx512vlvnniintrin.h | 17 ++---
clang/lib/Headers/avx512vnniintrin.h | 8 +--
clang/lib/Headers/avxvnniintrin.h | 12 ++--
.../test/CodeGen/X86/avx512vlvnni-builtins.c | 24 +++----
clang/test/CodeGen/X86/avx512vnni-builtins.c | 12 ++--
clang/test/CodeGen/X86/avxvnni-builtins.c | 16 ++---
llvm/include/llvm/IR/IntrinsicsX86.td | 24 +++----
llvm/lib/IR/AutoUpgrade.cpp | 72 +++++++++++++++++++
.../Instrumentation/MemorySanitizer.cpp | 36 +++++-----
.../CodeGen/X86/avx512vl_vnni-intrinsics.ll | 58 +++++++--------
.../X86/avx512vnni-intrinsics-upgrade.ll | 42 ++++++++---
.../test/CodeGen/X86/avx512vnni-intrinsics.ll | 28 ++++----
.../X86/avx_vnni-intrinsics-upgrade.ll | 44 ++++++++++++
llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll | 24 +++----
.../CodeGen/X86/stack-folding-int-avxvnni.ll | 40 +++++------
.../X86/avx512vl_vnni-intrinsics-upgrade.ll | 72 +++++++++----------
.../X86/avx512vl_vnni-intrinsics.ll | 72 +++++++++----------
.../X86/avx512vnni-intrinsics-upgrade.ll | 36 +++++-----
.../X86/avx512vnni-intrinsics.ll | 36 +++++-----
.../X86/avx_vnni-intrinsics.ll | 24 +++----
21 files changed, 426 insertions(+), 283 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index cb08e2107f072..f61dd0546cc24 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1132,27 +1132,27 @@ let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<5
}
let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
}
let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
}
let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+ def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
}
let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
}
let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
}
let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+ def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
}
let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h
index a1a0338a69e0d..4b8a199af32e5 100644
--- a/clang/lib/Headers/avx512vlvnniintrin.h
+++ b/clang/lib/Headers/avx512vlvnniintrin.h
@@ -80,8 +80,8 @@
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
-#define _mm256_dpwssd_epi32(S, A, B) \
- ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+#define _mm256_dpwssd_epi32(S, A, B) \
+ ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v16hi)(A), (__v16hi)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@@ -98,8 +98,9 @@
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
-#define _mm256_dpwssds_epi32(S, A, B) \
- ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+#define _mm256_dpwssds_epi32(S, A, B) \
+ ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v16hi)(A), \
+ (__v16hi)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@@ -157,8 +158,8 @@
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
-#define _mm_dpwssd_epi32(S, A, B) \
- ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+#define _mm_dpwssd_epi32(S, A, B) \
+ ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v8hi)(A), (__v8hi)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@@ -175,8 +176,8 @@
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
-#define _mm_dpwssds_epi32(S, A, B) \
- ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+#define _mm_dpwssds_epi32(S, A, B) \
+ ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v8hi)(A), (__v8hi)(B)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h
index c386923360de6..2ce88efe4a04f 100644
--- a/clang/lib/Headers/avx512vnniintrin.h
+++ b/clang/lib/Headers/avx512vnniintrin.h
@@ -68,8 +68,8 @@ _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
- (__v16si)__B);
+ return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v32hi)__A,
+ (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -91,8 +91,8 @@ _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
- (__v16si)__B);
+ return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v32hi)__A,
+ (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h
index 3c4c44a930fe2..1d2e8c906effc 100644
--- a/clang/lib/Headers/avxvnniintrin.h
+++ b/clang/lib/Headers/avxvnniintrin.h
@@ -109,7 +109,8 @@ _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+ return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v16hi)__A,
+ (__v16hi)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
@@ -130,7 +131,8 @@ _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+ return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v16hi)__A,
+ (__v16hi)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
@@ -199,7 +201,8 @@ _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+ return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v8hi)__A,
+ (__v8hi)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
@@ -220,7 +223,8 @@ _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+ return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v8hi)__A,
+ (__v8hi)__B);
}
#undef __DEFAULT_FN_ATTRS128
diff --git a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
index f63b5c6e73917..11dbd717a9f77 100644
--- a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
@@ -47,41 +47,41 @@ __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
__m256i test_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_mask_dpwssd_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_dpwssd_epi32(__S, __U, __A, __B);
}
__m256i test_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_maskz_dpwssd_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_dpwssd_epi32(__U, __S, __A, __B);
}
__m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_dpwssd_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_dpwssd_epi32(__S, __A, __B);
}
__m256i test_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_mask_dpwssds_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_dpwssds_epi32(__S, __U, __A, __B);
}
__m256i test_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_maskz_dpwssds_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_dpwssds_epi32(__U, __S, __A, __B);
}
__m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_dpwssds_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_dpwssds_epi32(__S, __A, __B);
}
@@ -127,41 +127,41 @@ __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
__m128i test_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_mask_dpwssd_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask_dpwssd_epi32(__S, __U, __A, __B);
}
__m128i test_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_maskz_dpwssd_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_maskz_dpwssd_epi32(__U, __S, __A, __B);
}
__m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_dpwssd_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_dpwssd_epi32(__S, __A, __B);
}
__m128i test_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_mask_dpwssds_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask_dpwssds_epi32(__S, __U, __A, __B);
}
__m128i test_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_maskz_dpwssds_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_maskz_dpwssds_epi32(__U, __S, __A, __B);
}
__m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_dpwssds_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_dpwssds_epi32(__S, __A, __B);
}
diff --git a/clang/test/CodeGen/X86/avx512vnni-builtins.c b/clang/test/CodeGen/X86/avx512vnni-builtins.c
index afe80458e37cc..6b8465206eedb 100644
--- a/clang/test/CodeGen/X86/avx512vnni-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vnni-builtins.c
@@ -47,41 +47,41 @@ __m512i test_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) {
__m512i test_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_dpwssd_epi32
- // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_dpwssd_epi32(__S, __U, __A, __B);
}
__m512i test_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_maskz_dpwssd_epi32
- // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_dpwssd_epi32(__U, __S, __A, __B);
}
__m512i test_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_dpwssd_epi32
- // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
return _mm512_dpwssd_epi32(__S, __A, __B);
}
__m512i test_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_dpwssds_epi32
- // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_dpwssds_epi32(__S, __U, __A, __B);
}
__m512i test_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_maskz_dpwssds_epi32
- // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_dpwssds_epi32(__U, __S, __A, __B);
}
__m512i test_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_dpwssds_epi32
- // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
return _mm512_dpwssds_epi32(__S, __A, __B);
}
diff --git a/clang/test/CodeGen/X86/avxvnni-builtins.c b/clang/test/CodeGen/X86/avxvnni-builtins.c
index 7948e0d57d9bf..6557a26807eb2 100644
--- a/clang/test/CodeGen/X86/avxvnni-builtins.c
+++ b/clang/test/CodeGen/X86/avxvnni-builtins.c
@@ -19,13 +19,13 @@ __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
__m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_dpwssd_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_dpwssd_epi32(__S, __A, __B);
}
__m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_dpwssds_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_dpwssds_epi32(__S, __A, __B);
}
@@ -43,13 +43,13 @@ __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
__m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_dpwssd_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_dpwssd_epi32(__S, __A, __B);
}
__m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_dpwssds_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_dpwssds_epi32(__S, __A, __B);
}
@@ -67,13 +67,13 @@ __m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
__m256i test_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_dpwssd_avx_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_dpwssd_avx_epi32(__S, __A, __B);
}
__m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_dpwssds_avx_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_dpwssds_avx_epi32(__S, __A, __B);
}
@@ -91,12 +91,12 @@ __m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
__m128i test_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_dpwssd_avx_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_dpwssd_avx_epi32(__S, __A, __B);
}
__m128i test_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_dpwssds_avx_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_dpwssds_avx_epi32(__S, __A, __B);
}
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 1dd23f60c7e1e..546eed7c904a1 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -1893,29 +1893,29 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_vpdpwssd_128 :
ClangBuiltin<"__builtin_ia32_vpdpwssd128">,
- DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty,
+ llvm_v8i16_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpwssd_256 :
ClangBuiltin<"__builtin_ia32_vpdpwssd256">,
- DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v16i16_ty,
+ llvm_v16i16_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpwssd_512 :
ClangBuiltin<"__builtin_ia32_vpdpwssd512">,
- DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v32i16_ty,
+ llvm_v32i16_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpwssds_128 :
ClangBuiltin<"__builtin_ia32_vpdpwssds128">,
- DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty,
+ llvm_v8i16_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpwssds_256 :
ClangBuiltin<"__builtin_ia32_vpdpwssds256">,
- DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v16i16_ty,
+ llvm_v16i16_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpwssds_512 :
ClangBuiltin<"__builtin_ia32_vpdpwssds512">,
- DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v32i16_ty,
+ llvm_v32i16_ty], [IntrNoMem]>;
def int_x86_avx2_vpdpbssd_128
: ClangBuiltin<"__builtin_ia32_vpdpbssd128">,
DefaultAttrsIntrinsic<[llvm_v4i32_ty],
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 58b7ddd0381e5..53ae18c4591f7 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -125,6 +125,24 @@ static bool upgradeX86MultiplyAddBytes(Function *F, Intrinsic::ID IID,
return true;
}
+// Upgrade the declaration of multiply-and-add-words intrinsics whose input
+// argument types have changed from vectors of i32 to vectors of i16.
+static bool upgradeX86MultiplyAddWords(Function *F, Intrinsic::ID IID,
+ Function *&NewFn) {
+ // check if input argument type is a vector of i16
+ Type *Arg1Type = F->getFunctionType()->getParamType(1);
+ Type *Arg2Type = F->getFunctionType()->getParamType(2);
+ if (Arg1Type->isVectorTy() &&
+ cast<VectorType>(Arg1Type)->getElementType()->isIntegerTy(16) &&
+ Arg2Type->isVectorTy() &&
+ cast<VectorType>(Arg2Type)->getElementType()->isIntegerTy(16))
+ return false;
+
+ rename(F);
+ NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
+ return true;
+}
+
static bool upgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID,
Function *&NewFn) {
if (F->getReturnType()->getScalarType()->isBFloatTy())
@@ -590,6 +608,19 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name,
.Default(Intrinsic::not_intrinsic);
if (ID != Intrinsic::not_intrinsic)
return upgradeX86MultiplyAddBytes(F, ID, NewFn);
+ } else if (Name.starts_with("vpdpwssd.") ||
+ Name.starts_with("vpdpwssds.")) {
+ // Added in 21.1
+ ID = StringSwitch<Intrinsic::ID>(Name)
+ .Case("vpdpwssd.128", Intrinsic::x86_avx512_vpdpwssd_128)
+ .Case("vpdpwssd.256", Intrinsic::x86_avx512_vpdpwssd_256)
+ .Case("vpdpwssd.512", Intrinsic::x86_avx512_vpdpwssd_512)
+ .Case("vpdpwssds.128", Intrinsic::x86_avx512_vpdpwssds_128)
+ .Case("vpdpwssds.256", Intrinsic::x86_avx512_vpdpwssds_256)
+ .Case("vpdpwssds.512", Intrinsic::x86_avx512_vpdpwssds_512)
+ .Default(Intrinsic::not_intrinsic);
+ if (ID != Intrinsic::not_intrinsic)
+ return upgradeX86MultiplyAddWords(F, ID, NewFn);
}
return false; // No other 'x86.avx512.*'.
}
@@ -4307,6 +4338,32 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2)};
+
+  // Input argument types were incorrectly set to vectors of i32 before, but
+  // they should be vectors of i16. Insert a bitcast when encountering the old
+  // types.
+ if (Args[1]->getType()->isVectorTy() &&
+ cast<VectorType>(Args[1]->getType())
+ ->getElementType()
+ ->isIntegerTy(32) &&
+ Args[2]->getType()->isVectorTy() &&
+ cast<VectorType>(Args[2]->getType())
+ ->getElementType()
+ ->isIntegerTy(32)) {
+ Type *NewArgType = nullptr;
+ if (VecWidth == 128)
+ NewArgType = VectorType::get(Builder.getInt16Ty(), 8, false);
+ else if (VecWidth == 256)
+ NewArgType = VectorType::get(Builder.getInt16Ty(), 16, false);
+ else if (VecWidth == 512)
+ NewArgType = VectorType::get(Builder.getInt16Ty(), 32, false);
+ else
+ llvm_unreachable("Unexpected vector bit width");
+
+ Args[1] = Builder.CreateBitCast(Args[1], NewArgType);
+ Args[2] = Builder.CreateBitCast(Args[2], NewArgType);
+ }
+
Rep = Builder.CreateIntrinsic(IID, Args);
Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
: CI->getArgOperand(0);
@@ -5382,6 +5439,21 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
NewCall = Builder.CreateCall(NewFn, Args);
break;
}
+ case Intrinsic::x86_avx512_vpdpwssd_128:
+ case Intrinsic::x86_avx512_vpdpwssd_256:
+ case Intrinsic::x86_avx512_vpdpwssd_512:
+ case Intrinsic::x86_avx512_vpdpwssds_128:
+ case Intrinsic::x86_avx512_vpdpwssds_256:
+ case Intrinsic::x86_avx512_vpdpwssds_512:
+ unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() / 16;
+ Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2)};
+ Type *NewArgType = VectorType::get(Builder.getInt16Ty(), NumElts, false);
+ Args[1] = Builder.CreateBitCast(Args[1], NewArgType);
+ Args[2] = Builder.CreateBitCast(Args[2], NewArgType);
+
+ NewCall = Builder.CreateCall(NewFn, Args);
+ break;
}
assert(NewCall && "Should have either set this variable or returned through "
"the default case");
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index ceeece41782f4..f0fe71e9eb69e 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -5875,46 +5875,46 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
//
// Multiply and Add Signed Word Integers
// < 4 x i32> @llvm.x86.avx512.vpdpwssd.128
- // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // (< 4 x i32>, < 8 x i16>, < 8 x i16>)
// < 8 x i32> @llvm.x86.avx512.vpdpwssd.256
- // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // (< 8 x i32>, <16 x i16>, <16 x i16>)
// <16 x i32> @llvm.x86.avx512.vpdpwssd.512
- // (<16 x i32>, <16 x i32>, <16 x i32>)
+ // (<16 x i32>, <32 x i16>, <32 x i16>)
//
// Multiply and Add Signed Word Integers With Saturation
// < 4 x i32> @llvm.x86.avx512.vpdpwssds.128
- // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // (< 4 x i32>, < 8 x i16>, < 8 x i16>)
// < 8 x i32> @llvm.x86.avx512.vpdpwssds.256
- // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // (< 8 x i32>, <16 x i16>, <16 x i16>)
// <16 x i32> @llvm.x86.avx512.vpdpwssds.512
- // (<16 x i32>, <16 x i32>, <16 x i32>)
+ // (<16 x i32>, <32 x i16>, <32 x i16>)
//
// These intrinsics are auto-upgraded into non-masked forms:
// <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <8 x i16>, <8 x i16>, i8)
// <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <8 x i16>, <8 x i16>, i8)
// <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <16 x i16>, <16 x i16>, i8)
// <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <16 x i16>, <16 x i16>, i8)
// <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <32 x i16>, <32 x i16>, i16)
// <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <32 x i16>, <32 x i16>, i16)
//
// <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <8 x i16>, <8 x i16>, i8)
// <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <8 x i16>, <8 x i16>, i8)
// <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <16 x i16>, <16 x i16>, i8)
// <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <16 x i16>, <16 x i16>, i8)
// <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <32 x i16>, <32 x i16>, i16)
// <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <32 x i16>, <32 x i16>, i16)
case Intrinsic::x86_avx512_vpdpwssd_128:
case Intrinsic::x86_avx512_vpdpwssd_256:
case Intrinsic::x86_avx512_vpdpwssd_512:
diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
index b8ebe2a4890a1..ddf0050dbd74a 100644
--- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
@@ -178,18 +178,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
ret { <4 x i32>, <4 x i32> } %res2
}
-declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>)
-define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x52,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
ret <8 x i32> %1
}
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_256:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
@@ -209,11 +209,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
; X64-NEXT: vpdpwssd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xda]
; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <8 x i32>, ptr %x2p
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %x2 = load <16 x i16>, ptr %x2p
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
- %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
%res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -221,18 +221,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
ret { <8 x i32>, <8 x i32> } %res2
}
-declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>)
-define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x52,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
ret <4 x i32> %1
}
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_128:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
@@ -252,12 +252,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
; X64-NEXT: vpdpwssd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xda]
; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <8 x i16>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
- %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
@@ -266,18 +266,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
ret { <4 x i32>, <4 x i32> } %res2
}
-declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>)
-define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x53,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
ret <8 x i32> %1
}
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_256:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
@@ -297,11 +297,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
; X64-NEXT: vpdpwssds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xda]
; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <8 x i32>, ptr %x2p
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %x2 = load <16 x i16>, ptr %x2p
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
- %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
%res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -309,9 +309,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
ret { <8 x i32>, <8 x i32> } %res2
}
-declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>)
-define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p) {
+define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p) {
; X86-LABEL: test_int_x86_avx512_vpdpwssds_128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
@@ -322,12 +322,12 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
; X64: # %bb.0:
; X64-NEXT: vpdpwssds (%rdi), %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x53,0x07]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <8 x i16>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
ret <4 x i32> %1
}
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_128:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
@@ -347,12 +347,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
; X64-NEXT: vpdpwssds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xda]
; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <8 x i16>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
- %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
index 63ff88a7fa4ae..35363e99ef4c8 100644
--- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
@@ -102,11 +102,22 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpbusds_512(<16 x
ret { <16 x i32>, <16 x i32> } %res3
}
+declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
+
declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
-; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512:
+define <16 x i32>@test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
@@ -114,8 +125,8 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x
ret <16 x i32> %res
}
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
-; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512:
; X86: # %bb.0:
; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
@@ -125,7 +136,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
; X86-NEXT: retl # encoding: [0xc3]
;
-; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
+; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512:
; X64: # %bb.0:
; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
@@ -141,20 +152,31 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
ret { <16 x i32>, <16 x i32> } %res3
}
-declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
ret <16 x i32> %res
}
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
-; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512:
; X86: # %bb.0:
; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
@@ -164,7 +186,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
; X86-NEXT: retl # encoding: [0xc3]
;
-; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
+; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512:
; X64: # %bb.0:
; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
index 60d0298e057f3..e97b8a5c5503f 100644
--- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
@@ -86,18 +86,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
ret { <16 x i32>, <16 x i32> } %res2
}
-declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <32 x i16>, <32 x i16>)
-define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
ret <16 x i32> %1
}
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
; X86: # %bb.0:
; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
@@ -116,11 +116,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
; X64-NEXT: vpdpwssd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xda]
; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <16 x i32>, ptr %x2p
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %x2 = load <32 x i16>, ptr %x2p
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
- %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+ %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4)
%5 = bitcast i16 %x3 to <16 x i1>
%6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
%res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
@@ -128,18 +128,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
ret { <16 x i32>, <16 x i32> } %res2
}
-declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <32 x i16>, <32 x i16>)
-define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) {
; CHECK-LABEL: test_int_x86_avx512_ask_vpdpwssds_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
ret <16 x i32> %1
}
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
; X86: # %bb.0:
; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
@@ -158,11 +158,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
; X64-NEXT: vpdpwssds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xda]
; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <16 x i32>, ptr %x2p
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %x2 = load <32 x i16>, ptr %x2p
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
- %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+ %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4)
%5 = bitcast i16 %x3 to <16 x i1>
%6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
%res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
index 0f4a4f27b9715..f359ecef8ceb3 100644
--- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
@@ -45,3 +45,47 @@ define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8
%res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
ret <8 x i32> %res
}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssd_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ ret <8 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
index de8b2a41bf8c8..5748a426c76c3 100644
--- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
@@ -68,9 +68,9 @@ define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <1
ret <4 x i32> %res
}
-declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>)
-define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2]
@@ -80,13 +80,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x52,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
ret <8 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>)
-define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2]
@@ -96,13 +96,13 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x52,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
ret <4 x i32> %res
}
-declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>)
-define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2]
@@ -112,13 +112,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x53,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
ret <8 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>)
-define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2]
@@ -128,6 +128,6 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x53,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
ret <4 x i32> %res
}
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
index cd576b19f8766..345fa0efa42df 100644
--- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
@@ -4,16 +4,16 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
-declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>)
declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>)
declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>)
declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>)
declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>)
-define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -23,11 +23,11 @@ define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2)
ret <4 x i32> %2
}
-define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -37,11 +37,11 @@ define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1)
ret <4 x i32> %2
}
-define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -51,11 +51,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2)
ret <8 x i32> %2
}
-define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -65,11 +65,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1,
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1)
ret <8 x i32> %2
}
-define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -79,11 +79,11 @@ define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2)
ret <4 x i32> %2
}
-define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -93,11 +93,11 @@ define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1)
ret <4 x i32> %2
}
-define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -107,11 +107,11 @@ define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2)
ret <8 x i32> %2
}
-define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -121,7 +121,7 @@ define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1)
ret <8 x i32> %2
}
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll
index 5e937485ff282..d0daea5e68fea 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll
@@ -528,10 +528,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer
@@ -546,7 +546,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[TMP4]]
;
@@ -574,10 +574,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer
@@ -592,7 +592,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer
; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]]
@@ -601,10 +601,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16>
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer
@@ -619,7 +619,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer
; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer
@@ -653,10 +653,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer
@@ -671,7 +671,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[TMP4]]
;
@@ -699,10 +699,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer
@@ -717,7 +717,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer
; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -728,10 +728,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16>
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer
@@ -746,7 +746,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer
; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -783,10 +783,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer
@@ -801,7 +801,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[TMP4]]
;
@@ -829,10 +829,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer
@@ -847,7 +847,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer
; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]]
@@ -856,10 +856,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16>
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer
@@ -874,7 +874,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer
; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer
@@ -908,10 +908,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer
@@ -926,7 +926,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[TMP4]]
;
@@ -954,10 +954,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer
@@ -972,7 +972,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer
; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -983,10 +983,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16>
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer
@@ -1001,7 +1001,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer
; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll
index 1d3046804b74f..f2b1e16e3d5c2 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll
@@ -495,10 +495,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer
@@ -513,7 +513,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[TMP4]]
;
@@ -541,10 +541,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer
@@ -559,7 +559,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer
; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]]
@@ -568,10 +568,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16>
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer
@@ -586,7 +586,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer
; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer
@@ -623,10 +623,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer
@@ -641,7 +641,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[TMP4]]
;
@@ -669,10 +669,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer
@@ -687,7 +687,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer
; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -698,10 +698,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16>
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer
@@ -716,7 +716,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer
; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -757,10 +757,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer
@@ -775,7 +775,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[TMP4]]
;
@@ -803,10 +803,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer
@@ -821,7 +821,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer
; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]]
@@ -830,10 +830,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16>
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer
@@ -848,7 +848,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer
; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer
@@ -896,10 +896,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <8 x i16> [[TMP11]], zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <8 x i16> [[TMP12]], zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i16> [[TMP26]], zeroinitializer
@@ -914,7 +914,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <4 x i32> [[TMP23]], zeroinitializer
; CHECK-NEXT: [[TMP25:%.*]] = sext <4 x i1> [[TMP24]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP25]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP26]], <8 x i16> [[TMP10]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[TMP9]]
;
@@ -943,10 +943,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer
@@ -961,7 +961,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer
; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -972,10 +972,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16>
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer
@@ -990,7 +990,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer
; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll
index 5c99f8a3a1fb6..4e7598b92abcf 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll
@@ -270,10 +270,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer
@@ -288,7 +288,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP4]]
;
@@ -316,10 +316,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer
@@ -334,7 +334,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer
; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]]
@@ -343,10 +343,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16>
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer
@@ -361,7 +361,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer
; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer
@@ -395,10 +395,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer
@@ -413,7 +413,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP4]]
;
@@ -441,10 +441,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer
@@ -459,7 +459,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer
; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]]
@@ -468,10 +468,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16>
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer
@@ -486,7 +486,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer
; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll
index 236ff45c6cd08..0fd60149cc15e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll
@@ -251,10 +251,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer
@@ -269,7 +269,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP4]]
;
@@ -297,10 +297,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer
@@ -315,7 +315,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer
; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]]
@@ -324,10 +324,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16>
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer
@@ -342,7 +342,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer
; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer
@@ -379,10 +379,10 @@ define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i3
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer
@@ -397,7 +397,7 @@ define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i3
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP4]]
;
@@ -425,10 +425,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer
@@ -443,7 +443,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer
; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]]
@@ -452,10 +452,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16>
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer
; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer
@@ -470,7 +470,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer
; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll
index 0344fbd5ee2a9..0a6c5f7b089ba 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll
@@ -143,10 +143,10 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
@@ -161,7 +161,7 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP4]], <16 x i16> [[TMP5]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[RES]]
;
@@ -178,10 +178,10 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
@@ -196,7 +196,7 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[RES]]
;
@@ -213,10 +213,10 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
@@ -231,7 +231,7 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP4]], <16 x i16> [[TMP5]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[RES]]
;
@@ -248,10 +248,10 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
@@ -266,7 +266,7 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[RES]]
;
More information about the llvm-commits
mailing list