[clang] [llvm] [x86][AVX-VNNI] Fix VPDPBUSD Argument Types (PR #155194)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 26 00:51:08 PDT 2025
https://github.com/BaiXilin updated https://github.com/llvm/llvm-project/pull/155194
>From 944873d36128c76b55fb804e447ced36da43f8f5 Mon Sep 17 00:00:00 2001
From: BaiXilin <x53bai at uwaterloo.ca>
Date: Sun, 24 Aug 2025 16:53:49 -0400
Subject: [PATCH 1/2] [x86][AVX-VNNI] Fix VPDPBUSD Argument Types
Fixed intrinsic VPDPBUSD[,S]_128/256/512's argument types to match with
the ISA.
Fixes #97271
---
clang/include/clang/Basic/BuiltinsX86.td | 12 +-
clang/lib/Headers/avx512vlvnniintrin.h | 18 +-
clang/lib/Headers/avx512vnniintrin.h | 8 +-
clang/lib/Headers/avxvnniintrin.h | 12 +-
.../test/CodeGen/X86/avx512vlvnni-builtins.c | 24 +-
clang/test/CodeGen/X86/avx512vnni-builtins.c | 12 +-
clang/test/CodeGen/X86/avxvnni-builtins.c | 16 +-
llvm/include/llvm/IR/IntrinsicsX86.td | 24 +-
llvm/lib/IR/AutoUpgrade.cpp | 28 ++
.../CodeGen/X86/avx512vl_vnni-intrinsics.ll | 56 +--
.../test/CodeGen/X86/avx512vnni-intrinsics.ll | 28 +-
llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll | 24 +-
.../CodeGen/X86/stack-folding-int-avxvnni.ll | 40 +--
.../X86/avx512vl_vnni-intrinsics-upgrade.ll | 72 ++--
.../X86/avx512vl_vnni-intrinsics.ll | 320 ++++++++----------
.../X86/avx512vnni-intrinsics-upgrade.ll | 36 +-
.../X86/avx512vnni-intrinsics.ll | 156 ++++-----
.../X86/avx_vnni-intrinsics.ll | 112 +++---
18 files changed, 472 insertions(+), 526 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 527acd9ef086e..dea55794ee4ce 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1108,27 +1108,27 @@ let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWi
}
let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpdpbusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpbusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, char>)">;
}
let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpdpbusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpbusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, char>)">;
}
let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vpdpbusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+ def vpdpbusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, char>)">;
}
let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpdpbusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+ def vpdpbusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, char>)">;
}
let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpdpbusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+ def vpdpbusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, char>)">;
}
let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vpdpbusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+ def vpdpbusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, char>)">;
}
let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h
index d1e5cd9d6983f..3ae4c3be57542 100644
--- a/clang/lib/Headers/avx512vlvnniintrin.h
+++ b/clang/lib/Headers/avx512vlvnniintrin.h
@@ -41,8 +41,8 @@
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
-#define _mm256_dpbusd_epi32(S, A, B) \
- ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+#define _mm256_dpbusd_epi32(S, A, B) \
+ ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v32qi)(A), (__v32qi)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@@ -61,8 +61,9 @@
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
-#define _mm256_dpbusds_epi32(S, A, B) \
- ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+#define _mm256_dpbusds_epi32(S, A, B) \
+ ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v32qi)(A), \
+ (__v32qi)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@@ -117,8 +118,8 @@
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
-#define _mm_dpbusd_epi32(S, A, B) \
- ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+#define _mm_dpbusd_epi32(S, A, B) \
+ ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v16qi)(A), (__v16qi)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@@ -137,8 +138,9 @@
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
-#define _mm_dpbusds_epi32(S, A, B) \
- ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+#define _mm_dpbusds_epi32(S, A, B) \
+ ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v16qi)(A), \
+ (__v16qi)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h
index 0fb381a12f2fd..1c8769c821fe2 100644
--- a/clang/lib/Headers/avx512vnniintrin.h
+++ b/clang/lib/Headers/avx512vnniintrin.h
@@ -22,8 +22,8 @@
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A,
- (__v16si)__B);
+ return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v64qi)__A,
+ (__v64qi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -45,8 +45,8 @@ _mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
- return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A,
- (__v16si)__B);
+ return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v64qi)__A,
+ (__v64qi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h
index b7de562b57c06..ebb72bbaf6657 100644
--- a/clang/lib/Headers/avxvnniintrin.h
+++ b/clang/lib/Headers/avxvnniintrin.h
@@ -63,7 +63,8 @@
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+ return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v32qi)__A,
+ (__v32qi)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
@@ -86,7 +87,8 @@ _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+ return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v32qi)__A,
+ (__v32qi)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
@@ -151,7 +153,8 @@ _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+ return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v16qi)__A,
+ (__v16qi)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
@@ -174,7 +177,8 @@ _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+ return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v16qi)__A,
+ (__v16qi)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
diff --git a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
index 3de4cca1a7e23..f63b5c6e73917 100644
--- a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
@@ -7,41 +7,41 @@
__m256i test_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_mask_dpbusd_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_dpbusd_epi32(__S, __U, __A, __B);
}
__m256i test_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_maskz_dpbusd_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_dpbusd_epi32(__U, __S, __A, __B);
}
__m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_dpbusd_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
return _mm256_dpbusd_epi32(__S, __A, __B);
}
__m256i test_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_mask_dpbusds_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_dpbusds_epi32(__S, __U, __A, __B);
}
__m256i test_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_maskz_dpbusds_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_dpbusds_epi32(__U, __S, __A, __B);
}
__m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_dpbusds_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
return _mm256_dpbusds_epi32(__S, __A, __B);
}
@@ -87,41 +87,41 @@ __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
__m128i test_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_mask_dpbusd_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask_dpbusd_epi32(__S, __U, __A, __B);
}
__m128i test_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_maskz_dpbusd_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_maskz_dpbusd_epi32(__U, __S, __A, __B);
}
__m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_dpbusd_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
return _mm_dpbusd_epi32(__S, __A, __B);
}
__m128i test_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_mask_dpbusds_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask_dpbusds_epi32(__S, __U, __A, __B);
}
__m128i test_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_maskz_dpbusds_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_maskz_dpbusds_epi32(__U, __S, __A, __B);
}
__m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_dpbusds_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
return _mm_dpbusds_epi32(__S, __A, __B);
}
diff --git a/clang/test/CodeGen/X86/avx512vnni-builtins.c b/clang/test/CodeGen/X86/avx512vnni-builtins.c
index a0177b3ba0a2c..afe80458e37cc 100644
--- a/clang/test/CodeGen/X86/avx512vnni-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vnni-builtins.c
@@ -7,41 +7,41 @@
__m512i test_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_dpbusd_epi32
- // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}})
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_dpbusd_epi32(__S, __U, __A, __B);
}
__m512i test_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_maskz_dpbusd_epi32
- // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}})
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_dpbusd_epi32(__U, __S, __A, __B);
}
__m512i test_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_dpbusd_epi32
- // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}})
return _mm512_dpbusd_epi32(__S, __A, __B);
}
__m512i test_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_dpbusds_epi32
- // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}})
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_dpbusds_epi32(__S, __U, __A, __B);
}
__m512i test_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_maskz_dpbusds_epi32
- // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}})
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_dpbusds_epi32(__U, __S, __A, __B);
}
__m512i test_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_dpbusds_epi32
- // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}})
return _mm512_dpbusds_epi32(__S, __A, __B);
}
diff --git a/clang/test/CodeGen/X86/avxvnni-builtins.c b/clang/test/CodeGen/X86/avxvnni-builtins.c
index bb28a359424c8..7948e0d57d9bf 100644
--- a/clang/test/CodeGen/X86/avxvnni-builtins.c
+++ b/clang/test/CodeGen/X86/avxvnni-builtins.c
@@ -7,13 +7,13 @@
__m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_dpbusd_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
return _mm256_dpbusd_epi32(__S, __A, __B);
}
__m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_dpbusds_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
return _mm256_dpbusds_epi32(__S, __A, __B);
}
@@ -31,13 +31,13 @@ __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
__m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_dpbusd_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
return _mm_dpbusd_epi32(__S, __A, __B);
}
__m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_dpbusds_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
return _mm_dpbusds_epi32(__S, __A, __B);
}
@@ -55,13 +55,13 @@ __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
__m256i test_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_dpbusd_avx_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
return _mm256_dpbusd_avx_epi32(__S, __A, __B);
}
__m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_dpbusds_avx_epi32
- // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
return _mm256_dpbusds_avx_epi32(__S, __A, __B);
}
@@ -79,13 +79,13 @@ __m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
__m128i test_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_dpbusd_avx_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
return _mm_dpbusd_avx_epi32(__S, __A, __B);
}
__m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_dpbusds_avx_epi32
- // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
return _mm_dpbusds_avx_epi32(__S, __A, __B);
}
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index dff5785531ada..4af9ffc52ba6b 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -1867,29 +1867,29 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_vpdpbusd_128 :
ClangBuiltin<"__builtin_ia32_vpdpbusd128">,
- DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v16i8_ty,
+ llvm_v16i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpbusd_256 :
ClangBuiltin<"__builtin_ia32_vpdpbusd256">,
- DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v32i8_ty,
+ llvm_v32i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpbusd_512 :
ClangBuiltin<"__builtin_ia32_vpdpbusd512">,
- DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v64i8_ty,
+ llvm_v64i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpbusds_128 :
ClangBuiltin<"__builtin_ia32_vpdpbusds128">,
- DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v16i8_ty,
+ llvm_v16i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpbusds_256 :
ClangBuiltin<"__builtin_ia32_vpdpbusds256">,
- DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v32i8_ty,
+ llvm_v32i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpbusds_512 :
ClangBuiltin<"__builtin_ia32_vpdpbusds512">,
- DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v64i8_ty,
+ llvm_v64i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpwssd_128 :
ClangBuiltin<"__builtin_ia32_vpdpwssd128">,
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index e200f3626e69d..bb5e9ab0830c8 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -4148,6 +4148,34 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2)};
+
+ // Input arguments types were incorrectly set to vectors of i32 before but
+ // they should be vectors of i8. Insert bit cast when encountering the old
+ // types
+ if (Args[1]->getType()->isVectorTy() &&
+ cast<VectorType>(Args[1]->getType())
+ ->getElementType()
+ ->isIntegerTy(32) &&
+ Args[2]->getType()->isVectorTy() &&
+ cast<VectorType>(Args[2]->getType())
+ ->getElementType()
+ ->isIntegerTy(32)) {
+ Type *NewArgType = nullptr;
+ if (VecWidth == 128)
+ NewArgType = VectorType::get(Builder.getInt8Ty(), 16, false);
+ else if (VecWidth == 256)
+ NewArgType = VectorType::get(Builder.getInt8Ty(), 32, false);
+ else if (VecWidth == 512)
+ NewArgType = VectorType::get(Builder.getInt8Ty(), 64, false);
+ else
+ llvm_unreachable("Unexpected vector bit width");
+
+ if (NewArgType) {
+ Args[1] = Builder.CreateBitCast(Args[1], NewArgType);
+ Args[2] = Builder.CreateBitCast(Args[2], NewArgType);
+ }
+ }
+
Rep = Builder.CreateIntrinsic(IID, Args);
Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
: CI->getArgOperand(0);
diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
index 7613c9ff43e21..b8ebe2a4890a1 100644
--- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
@@ -2,18 +2,18 @@
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vnni,+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
-declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>)
-define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpbusd_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpbusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x50,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
ret <8 x i32> %1
}
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4, i8 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_256:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
@@ -33,11 +33,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
; X64-NEXT: vpdpbusd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x50,0xda]
; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <8 x i32>, ptr %x2p
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %x2 = load <32 x i8>, ptr %x2p
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
- %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
%res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -45,18 +45,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
ret { <8 x i32>, <8 x i32> } %res2
}
-declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>)
-define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpbusd_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpbusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x50,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
ret <4 x i32> %1
}
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4, i8 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_128:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
@@ -76,12 +76,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
; X64-NEXT: vpdpbusd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x50,0xda]
; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <16 x i8>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
- %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
@@ -90,18 +90,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
ret { <4 x i32>, <4 x i32> } %res2
}
-declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>)
-define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpbusds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpbusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x51,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
ret <8 x i32> %1
}
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4, i8 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_256:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
@@ -121,11 +121,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
; X64-NEXT: vpdpbusds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x51,0xda]
; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <8 x i32>, ptr %x2p
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %x2 = load <32 x i8>, ptr %x2p
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
- %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
%res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -133,18 +133,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
ret { <8 x i32>, <8 x i32> } %res2
}
-declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>)
-define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpbusds_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpbusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x51,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
ret <4 x i32> %1
}
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4, i8 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_128:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
@@ -164,12 +164,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
; X64-NEXT: vpdpbusds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x51,0xda]
; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <16 x i8>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
- %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
index 21d0010ff6303..60d0298e057f3 100644
--- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
@@ -2,18 +2,18 @@
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vnni --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
-declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <64 x i8>, <64 x i8>)
-define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_ask_vpdpbusd_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpbusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x50,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2)
ret <16 x i32> %1
}
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <64 x i8> %x1, ptr %x2p, <64 x i8> %x4, i16 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_512:
; X86: # %bb.0:
; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
@@ -32,11 +32,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
; X64-NEXT: vpdpbusd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x50,0xda]
; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <16 x i32>, ptr %x2p
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %x2 = load <64 x i8>, ptr %x2p
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
- %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+ %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x4)
%5 = bitcast i16 %x3 to <16 x i1>
%6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
%res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
@@ -44,18 +44,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
ret { <16 x i32>, <16 x i32> } %res2
}
-declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <64 x i8>, <64 x i8>)
-define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpbusds_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpbusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x51,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2)
ret <16 x i32> %1
}
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <64 x i8> %x1, ptr %x2p, <64 x i8> %x4, i16 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_512:
; X86: # %bb.0:
; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
@@ -74,11 +74,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
; X64-NEXT: vpdpbusds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x51,0xda]
; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <16 x i32>, ptr %x2p
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %x2 = load <64 x i8>, ptr %x2p
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
- %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+ %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x4)
%5 = bitcast i16 %x3 to <16 x i1>
%6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
%res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
index a1db6e54fa796..de8b2a41bf8c8 100644
--- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
@@ -4,9 +4,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVXVNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVX512VNNI
-declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>)
-define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x50,0xc2]
@@ -16,13 +16,13 @@ define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x50,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
ret <8 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>)
-define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x50,0xc2]
@@ -32,13 +32,13 @@ define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x50,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
ret <4 x i32> %res
}
-declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>)
-define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x51,0xc2]
@@ -48,13 +48,13 @@ define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x51,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
ret <8 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>)
-define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x51,0xc2]
@@ -64,7 +64,7 @@ define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x51,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
ret <4 x i32> %res
}
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
index 4b0f63f9a6389..cd576b19f8766 100644
--- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
@@ -8,10 +8,10 @@ declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>)
define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd:
@@ -125,7 +125,7 @@ define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1
ret <8 x i32> %2
}
-define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -135,11 +135,11 @@ define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a
; CHECK-NEXT: {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2)
ret <4 x i32> %2
}
-define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusd_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -150,11 +150,11 @@ define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4
; CHECK-NEXT: {vex} vpdpbusd %xmm1, %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1)
ret <4 x i32> %2
}
-define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusd_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -164,11 +164,11 @@ define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32
; CHECK-NEXT: {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2)
ret <8 x i32> %2
}
-define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusd_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -179,11 +179,11 @@ define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <8 x i32> %a1,
; CHECK-NEXT: {vex} vpdpbusd %ymm1, %ymm2, %ymm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1)
ret <8 x i32> %2
}
-define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusds:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -193,11 +193,11 @@ define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %
; CHECK-NEXT: {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2)
ret <4 x i32> %2
}
-define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusds_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -208,11 +208,11 @@ define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4
; CHECK-NEXT: {vex} vpdpbusds %xmm1, %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1)
ret <4 x i32> %2
}
-define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -222,11 +222,11 @@ define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
; CHECK-NEXT: {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2)
ret <8 x i32> %2
}
-define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusds_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -237,6 +237,6 @@ define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <8 x i32> %a1
; CHECK-NEXT: {vex} vpdpbusds %ymm1, %ymm2, %ymm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1)
ret <8 x i32> %2
}
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll
index 822e546c84bca..777ca4548c71f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll
@@ -20,10 +20,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer
@@ -40,7 +40,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[TMP24]], <32 x i8> [[TMP5]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[TMP4]]
;
@@ -68,10 +68,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8>
+; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer
@@ -88,7 +88,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
; CHECK-NEXT: [[TMP66:%.*]] = bitcast <16 x i16> [[TMP65]] to i256
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i256 [[TMP66]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP29]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[TMP30]], <32 x i8> [[TMP31]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]]
@@ -97,10 +97,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8>
; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i32> [[TMP5]] to <32 x i8>
+; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8>
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i8> [[TMP41]], zeroinitializer
; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <32 x i8> [[TMP42]], zeroinitializer
; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <32 x i8> [[TMP39]], zeroinitializer
@@ -117,7 +117,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
; CHECK-NEXT: [[TMP56:%.*]] = bitcast <16 x i16> [[TMP55]] to i256
; CHECK-NEXT: [[TMP57:%.*]] = bitcast i256 [[TMP56]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP57]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[TMP39]], <32 x i8> [[TMP40]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer
@@ -151,10 +151,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer
@@ -171,7 +171,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[TMP24]], <16 x i8> [[TMP5]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[TMP4]]
;
@@ -199,10 +199,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
; CHECK-NEXT: [[TMP33:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8>
+; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer
@@ -219,7 +219,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
; CHECK-NEXT: [[TMP66:%.*]] = bitcast <8 x i16> [[TMP65]] to i128
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i128 [[TMP66]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP29]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[TMP30]], <16 x i8> [[TMP31]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -230,10 +230,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8>
; CHECK-NEXT: [[TMP41:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
; CHECK-NEXT: [[TMP42:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8>
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i8> [[TMP41]], zeroinitializer
; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <16 x i8> [[TMP42]], zeroinitializer
; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <16 x i8> [[TMP39]], zeroinitializer
@@ -250,7 +250,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
; CHECK-NEXT: [[TMP56:%.*]] = bitcast <8 x i16> [[TMP55]] to i128
; CHECK-NEXT: [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[TMP57]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[TMP39]], <16 x i8> [[TMP40]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -286,10 +286,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer
@@ -306,7 +306,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[TMP24]], <32 x i8> [[TMP5]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[TMP4]]
;
@@ -334,10 +334,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8>
+; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer
@@ -354,7 +354,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
; CHECK-NEXT: [[TMP66:%.*]] = bitcast <16 x i16> [[TMP65]] to i256
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i256 [[TMP66]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP29]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[TMP30]], <32 x i8> [[TMP31]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]]
@@ -363,10 +363,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8>
; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i32> [[TMP5]] to <32 x i8>
+; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8>
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i8> [[TMP41]], zeroinitializer
; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <32 x i8> [[TMP42]], zeroinitializer
; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <32 x i8> [[TMP39]], zeroinitializer
@@ -383,7 +383,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
; CHECK-NEXT: [[TMP56:%.*]] = bitcast <16 x i16> [[TMP55]] to i256
; CHECK-NEXT: [[TMP57:%.*]] = bitcast i256 [[TMP56]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP57]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[TMP39]], <32 x i8> [[TMP40]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer
@@ -417,10 +417,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer
@@ -437,7 +437,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[TMP24]], <16 x i8> [[TMP5]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[TMP4]]
;
@@ -465,10 +465,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
; CHECK-NEXT: [[TMP33:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8>
+; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer
@@ -485,7 +485,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
; CHECK-NEXT: [[TMP66:%.*]] = bitcast <8 x i16> [[TMP65]] to i128
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i128 [[TMP66]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP29]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[TMP30]], <16 x i8> [[TMP31]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -496,10 +496,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8>
; CHECK-NEXT: [[TMP41:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
; CHECK-NEXT: [[TMP42:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8>
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i8> [[TMP41]], zeroinitializer
; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <16 x i8> [[TMP42]], zeroinitializer
; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <16 x i8> [[TMP39]], zeroinitializer
@@ -516,7 +516,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
; CHECK-NEXT: [[TMP56:%.*]] = bitcast <8 x i16> [[TMP55]] to i128
; CHECK-NEXT: [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[TMP57]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[TMP39]], <16 x i8> [[TMP40]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll
index 38f4272ef106f..244eb54067ea8 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll
@@ -10,26 +10,22 @@
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>)
-define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
+define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) sanitize_memory {
; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusd_256(
-; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[X2:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[TMP24:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP6]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
@@ -39,22 +35,22 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[TMP4]]
;
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
ret <8 x i32> %1
}
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4, i8 %x3) sanitize_memory {
; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(
-; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP33:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP40:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]]
@@ -62,22 +58,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
; CHECK-NEXT: unreachable
; CHECK: [[BB7]]:
-; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32
+; CHECK-NEXT: [[X2:%.*]] = load <32 x i8>, ptr [[X2P]], align 32
; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8>
-; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT: [[TMP30:%.*]] = load <32 x i8>, ptr [[TMP10]], align 32
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer
-; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer
-; CHECK-NEXT: [[TMP38:%.*]] = and <32 x i1> [[TMP34]], [[TMP35]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <32 x i1> [[TMP36]], [[TMP35]]
-; CHECK-NEXT: [[TMP59:%.*]] = and <32 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP38:%.*]] = and <32 x i1> [[TMP35]], [[TMP36]]
+; CHECK-NEXT: [[TMP58:%.*]] = and <32 x i1> [[TMP31]], [[TMP36]]
+; CHECK-NEXT: [[TMP59:%.*]] = and <32 x i1> [[TMP35]], [[TMP32]]
; CHECK-NEXT: [[TMP60:%.*]] = or <32 x i1> [[TMP38]], [[TMP58]]
; CHECK-NEXT: [[TMP61:%.*]] = or <32 x i1> [[TMP60]], [[TMP59]]
; CHECK-NEXT: [[TMP62:%.*]] = sext <32 x i1> [[TMP61]] to <32 x i8>
@@ -87,7 +79,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
; CHECK-NEXT: [[TMP66:%.*]] = bitcast <16 x i16> [[TMP65]] to i256
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i256 [[TMP66]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP29]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]]
@@ -96,17 +88,13 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8>
-; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i32> [[TMP5]] to <32 x i8>
-; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i8> [[TMP41]], zeroinitializer
-; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <32 x i8> [[TMP42]], zeroinitializer
-; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <32 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT: [[TMP39:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <32 x i8> [[TMP40]], zeroinitializer
-; CHECK-NEXT: [[TMP47:%.*]] = and <32 x i1> [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP48:%.*]] = and <32 x i1> [[TMP45]], [[TMP44]]
-; CHECK-NEXT: [[TMP49:%.*]] = and <32 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer
+; CHECK-NEXT: [[TMP47:%.*]] = and <32 x i1> [[TMP39]], [[TMP46]]
+; CHECK-NEXT: [[TMP48:%.*]] = and <32 x i1> [[TMP37]], [[TMP46]]
+; CHECK-NEXT: [[TMP49:%.*]] = and <32 x i1> [[TMP39]], [[TMP41]]
; CHECK-NEXT: [[TMP50:%.*]] = or <32 x i1> [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[TMP51:%.*]] = or <32 x i1> [[TMP50]], [[TMP49]]
; CHECK-NEXT: [[TMP52:%.*]] = sext <32 x i1> [[TMP51]] to <32 x i8>
@@ -116,7 +104,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
; CHECK-NEXT: [[TMP56:%.*]] = bitcast <16 x i16> [[TMP55]] to i256
; CHECK-NEXT: [[TMP57:%.*]] = bitcast i256 [[TMP56]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP57]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer
@@ -132,11 +120,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
; CHECK-NEXT: store { <8 x i32>, <8 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret { <8 x i32>, <8 x i32> } [[RES2]]
;
- %x2 = load <8 x i32>, ptr %x2p
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %x2 = load <32 x i8>, ptr %x2p
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
- %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
%res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -144,26 +132,22 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
ret { <8 x i32>, <8 x i32> } %res2
}
-declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>)
-define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
+define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) sanitize_memory {
; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusd_128(
-; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP6]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8>
@@ -173,22 +157,22 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[TMP4]]
;
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
ret <4 x i32> %1
}
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4, i8 %x3) sanitize_memory {
; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(
-; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP33:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP40:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -196,22 +180,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
; CHECK: [[BB7]]:
-; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16
+; CHECK-NEXT: [[X2:%.*]] = load <16 x i8>, ptr [[X2P]], align 16
; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8>
-; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT: [[TMP30:%.*]] = load <16 x i8>, ptr [[TMP10]], align 16
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer
-; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer
-; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP34]], [[TMP35]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP36]], [[TMP35]]
-; CHECK-NEXT: [[TMP59:%.*]] = and <16 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP35]], [[TMP36]]
+; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP31]], [[TMP36]]
+; CHECK-NEXT: [[TMP59:%.*]] = and <16 x i1> [[TMP35]], [[TMP32]]
; CHECK-NEXT: [[TMP60:%.*]] = or <16 x i1> [[TMP38]], [[TMP58]]
; CHECK-NEXT: [[TMP61:%.*]] = or <16 x i1> [[TMP60]], [[TMP59]]
; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i8>
@@ -221,7 +201,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
; CHECK-NEXT: [[TMP66:%.*]] = bitcast <8 x i16> [[TMP65]] to i128
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i128 [[TMP66]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP29]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -232,17 +212,13 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8>
-; CHECK-NEXT: [[TMP41:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i8> [[TMP41]], zeroinitializer
-; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <16 x i8> [[TMP42]], zeroinitializer
-; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <16 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT: [[TMP39:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <16 x i8> [[TMP40]], zeroinitializer
-; CHECK-NEXT: [[TMP47:%.*]] = and <16 x i1> [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP48:%.*]] = and <16 x i1> [[TMP45]], [[TMP44]]
-; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer
+; CHECK-NEXT: [[TMP47:%.*]] = and <16 x i1> [[TMP39]], [[TMP46]]
+; CHECK-NEXT: [[TMP48:%.*]] = and <16 x i1> [[TMP37]], [[TMP46]]
+; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i1> [[TMP39]], [[TMP41]]
; CHECK-NEXT: [[TMP50:%.*]] = or <16 x i1> [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[TMP51:%.*]] = or <16 x i1> [[TMP50]], [[TMP49]]
; CHECK-NEXT: [[TMP52:%.*]] = sext <16 x i1> [[TMP51]] to <16 x i8>
@@ -252,7 +228,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
; CHECK-NEXT: [[TMP56:%.*]] = bitcast <8 x i16> [[TMP55]] to i128
; CHECK-NEXT: [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP57]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -270,12 +246,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
; CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } [[RES2]]
;
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <16 x i8>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
- %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
@@ -284,26 +260,22 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
ret { <4 x i32>, <4 x i32> } %res2
}
-declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>)
-define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
+define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) sanitize_memory {
; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusds_256(
-; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[X2:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP24:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP6]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
@@ -313,22 +285,22 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1,
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[TMP4]]
;
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
ret <8 x i32> %1
}
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4, i8 %x3) sanitize_memory {
; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(
-; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP33:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP40:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -336,22 +308,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
; CHECK: [[BB7]]:
-; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32
+; CHECK-NEXT: [[X2:%.*]] = load <32 x i8>, ptr [[X2P]], align 32
; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8>
-; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT: [[TMP30:%.*]] = load <32 x i8>, ptr [[TMP10]], align 32
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer
-; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer
-; CHECK-NEXT: [[TMP38:%.*]] = and <32 x i1> [[TMP34]], [[TMP35]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <32 x i1> [[TMP36]], [[TMP35]]
-; CHECK-NEXT: [[TMP59:%.*]] = and <32 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP38:%.*]] = and <32 x i1> [[TMP35]], [[TMP36]]
+; CHECK-NEXT: [[TMP58:%.*]] = and <32 x i1> [[TMP31]], [[TMP36]]
+; CHECK-NEXT: [[TMP59:%.*]] = and <32 x i1> [[TMP35]], [[TMP32]]
; CHECK-NEXT: [[TMP60:%.*]] = or <32 x i1> [[TMP38]], [[TMP58]]
; CHECK-NEXT: [[TMP61:%.*]] = or <32 x i1> [[TMP60]], [[TMP59]]
; CHECK-NEXT: [[TMP62:%.*]] = sext <32 x i1> [[TMP61]] to <32 x i8>
@@ -361,7 +329,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
; CHECK-NEXT: [[TMP66:%.*]] = bitcast <16 x i16> [[TMP65]] to i256
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i256 [[TMP66]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP29]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]]
@@ -370,17 +338,13 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8>
-; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i32> [[TMP5]] to <32 x i8>
-; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i8> [[TMP41]], zeroinitializer
-; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <32 x i8> [[TMP42]], zeroinitializer
-; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <32 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT: [[TMP39:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <32 x i8> [[TMP40]], zeroinitializer
-; CHECK-NEXT: [[TMP47:%.*]] = and <32 x i1> [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP48:%.*]] = and <32 x i1> [[TMP45]], [[TMP44]]
-; CHECK-NEXT: [[TMP49:%.*]] = and <32 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer
+; CHECK-NEXT: [[TMP47:%.*]] = and <32 x i1> [[TMP39]], [[TMP46]]
+; CHECK-NEXT: [[TMP48:%.*]] = and <32 x i1> [[TMP37]], [[TMP46]]
+; CHECK-NEXT: [[TMP49:%.*]] = and <32 x i1> [[TMP39]], [[TMP41]]
; CHECK-NEXT: [[TMP50:%.*]] = or <32 x i1> [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[TMP51:%.*]] = or <32 x i1> [[TMP50]], [[TMP49]]
; CHECK-NEXT: [[TMP52:%.*]] = sext <32 x i1> [[TMP51]] to <32 x i8>
@@ -390,7 +354,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
; CHECK-NEXT: [[TMP56:%.*]] = bitcast <16 x i16> [[TMP55]] to i256
; CHECK-NEXT: [[TMP57:%.*]] = bitcast i256 [[TMP56]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP57]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer
@@ -406,11 +370,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
; CHECK-NEXT: store { <8 x i32>, <8 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret { <8 x i32>, <8 x i32> } [[RES2]]
;
- %x2 = load <8 x i32>, ptr %x2p
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %x2 = load <32 x i8>, ptr %x2p
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
- %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
%res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -418,26 +382,22 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
ret { <8 x i32>, <8 x i32> } %res2
}
-declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>)
-define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
+define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) sanitize_memory {
; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusds_128(
-; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP6]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8>
@@ -447,22 +407,22 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1,
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[TMP4]]
;
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
ret <4 x i32> %1
}
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4, i8 %x3) sanitize_memory {
; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(
-; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP33:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP40:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -470,22 +430,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
; CHECK: [[BB7]]:
-; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16
+; CHECK-NEXT: [[X2:%.*]] = load <16 x i8>, ptr [[X2P]], align 16
; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8>
-; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT: [[TMP30:%.*]] = load <16 x i8>, ptr [[TMP10]], align 16
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer
-; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer
-; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP34]], [[TMP35]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP36]], [[TMP35]]
-; CHECK-NEXT: [[TMP59:%.*]] = and <16 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP35]], [[TMP36]]
+; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP31]], [[TMP36]]
+; CHECK-NEXT: [[TMP59:%.*]] = and <16 x i1> [[TMP35]], [[TMP32]]
; CHECK-NEXT: [[TMP60:%.*]] = or <16 x i1> [[TMP38]], [[TMP58]]
; CHECK-NEXT: [[TMP61:%.*]] = or <16 x i1> [[TMP60]], [[TMP59]]
; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i8>
@@ -495,7 +451,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
; CHECK-NEXT: [[TMP66:%.*]] = bitcast <8 x i16> [[TMP65]] to i128
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i128 [[TMP66]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP29]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -506,17 +462,13 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8>
-; CHECK-NEXT: [[TMP41:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i8> [[TMP41]], zeroinitializer
-; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <16 x i8> [[TMP42]], zeroinitializer
-; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <16 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT: [[TMP39:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <16 x i8> [[TMP40]], zeroinitializer
-; CHECK-NEXT: [[TMP47:%.*]] = and <16 x i1> [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP48:%.*]] = and <16 x i1> [[TMP45]], [[TMP44]]
-; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer
+; CHECK-NEXT: [[TMP47:%.*]] = and <16 x i1> [[TMP39]], [[TMP46]]
+; CHECK-NEXT: [[TMP48:%.*]] = and <16 x i1> [[TMP37]], [[TMP46]]
+; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i1> [[TMP39]], [[TMP41]]
; CHECK-NEXT: [[TMP50:%.*]] = or <16 x i1> [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[TMP51:%.*]] = or <16 x i1> [[TMP50]], [[TMP49]]
; CHECK-NEXT: [[TMP52:%.*]] = sext <16 x i1> [[TMP51]] to <16 x i8>
@@ -526,7 +478,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
; CHECK-NEXT: [[TMP56:%.*]] = bitcast <8 x i16> [[TMP55]] to i128
; CHECK-NEXT: [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP57]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -544,12 +496,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
; CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } [[RES2]]
;
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <16 x i8>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
- %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll
index f146823b90e03..b64c033f8f882 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll
@@ -20,10 +20,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <64 x i8> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer
@@ -40,7 +40,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x i16> [[TMP20]] to i512
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i512 [[TMP21]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <64 x i8> [[TMP24]], <64 x i8> [[TMP5]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP4]]
;
@@ -68,10 +68,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
; CHECK-NEXT: [[TMP33:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8>
+; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <64 x i8> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <64 x i8> [[TMP30]], zeroinitializer
@@ -88,7 +88,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
; CHECK-NEXT: [[TMP66:%.*]] = bitcast <32 x i16> [[TMP65]] to i512
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i512 [[TMP66]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP29]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <64 x i8> [[TMP30]], <64 x i8> [[TMP31]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]]
@@ -97,10 +97,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8>
; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i32> [[TMP5]] to <64 x i8>
+; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8>
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <64 x i8> [[TMP41]], zeroinitializer
; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <64 x i8> [[TMP42]], zeroinitializer
; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <64 x i8> [[TMP39]], zeroinitializer
@@ -117,7 +117,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
; CHECK-NEXT: [[TMP56:%.*]] = bitcast <32 x i16> [[TMP55]] to i512
; CHECK-NEXT: [[TMP57:%.*]] = bitcast i512 [[TMP56]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP57]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <64 x i8> [[TMP39]], <64 x i8> [[TMP40]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer
@@ -151,10 +151,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <64 x i8> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer
@@ -171,7 +171,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x i16> [[TMP20]] to i512
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i512 [[TMP21]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <64 x i8> [[TMP24]], <64 x i8> [[TMP5]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP4]]
;
@@ -199,10 +199,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
; CHECK-NEXT: [[TMP33:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8>
+; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <64 x i8> [[TMP32]], zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <64 x i8> [[TMP30]], zeroinitializer
@@ -219,7 +219,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
; CHECK-NEXT: [[TMP66:%.*]] = bitcast <32 x i16> [[TMP65]] to i512
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i512 [[TMP66]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP29]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <64 x i8> [[TMP30]], <64 x i8> [[TMP31]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]]
@@ -228,10 +228,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8>
; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i32> [[TMP5]] to <64 x i8>
+; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8>
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <64 x i8> [[TMP41]], zeroinitializer
; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <64 x i8> [[TMP42]], zeroinitializer
; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <64 x i8> [[TMP39]], zeroinitializer
@@ -248,7 +248,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
; CHECK-NEXT: [[TMP56:%.*]] = bitcast <32 x i16> [[TMP55]] to i512
; CHECK-NEXT: [[TMP57:%.*]] = bitcast i512 [[TMP56]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP57]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <64 x i8> [[TMP39]], <64 x i8> [[TMP40]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll
index 7c39ff6bb2be1..1b8c43f699e1f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll
@@ -10,26 +10,22 @@
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <64 x i8>, <64 x i8>)
-define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory {
+define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) sanitize_memory {
; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(
-; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-SAME: <16 x i32> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i8> [[X2:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[TMP24:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <64 x i8> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = and <64 x i1> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <64 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <64 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = and <64 x i1> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP6]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP15:%.*]] = or <64 x i1> [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = sext <64 x i1> [[TMP16]] to <64 x i8>
@@ -39,22 +35,22 @@ define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <16 x i3
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x i16> [[TMP20]] to i512
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i512 [[TMP21]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <64 x i8> [[X1]], <64 x i8> [[X2]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP4]]
;
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2)
ret <16 x i32> %1
}
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <64 x i8> %x1, ptr %x2p, <64 x i8> %x4, i16 %x3) sanitize_memory {
; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(
-; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: <16 x i32> [[X0:%.*]], <64 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <64 x i8> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP33:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP40:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]]
@@ -62,22 +58,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
; CHECK-NEXT: unreachable
; CHECK: [[BB7]]:
-; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P]], align 64
+; CHECK-NEXT: [[TMP31:%.*]] = load <64 x i8>, ptr [[X2P]], align 64
; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8>
-; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <64 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT: [[TMP30:%.*]] = load <64 x i8>, ptr [[TMP10]], align 64
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <64 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <64 x i8> [[X1]], zeroinitializer
; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <64 x i8> [[TMP31]], zeroinitializer
-; CHECK-NEXT: [[TMP38:%.*]] = and <64 x i1> [[TMP34]], [[TMP35]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <64 x i1> [[TMP36]], [[TMP35]]
-; CHECK-NEXT: [[TMP59:%.*]] = and <64 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = and <64 x i1> [[TMP35]], [[TMP36]]
+; CHECK-NEXT: [[TMP58:%.*]] = and <64 x i1> [[TMP32]], [[TMP36]]
+; CHECK-NEXT: [[TMP59:%.*]] = and <64 x i1> [[TMP35]], [[TMP37]]
; CHECK-NEXT: [[TMP60:%.*]] = or <64 x i1> [[TMP38]], [[TMP58]]
; CHECK-NEXT: [[TMP61:%.*]] = or <64 x i1> [[TMP60]], [[TMP59]]
; CHECK-NEXT: [[TMP62:%.*]] = sext <64 x i1> [[TMP61]] to <64 x i8>
@@ -87,7 +79,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
; CHECK-NEXT: [[TMP66:%.*]] = bitcast <32 x i16> [[TMP65]] to i512
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i512 [[TMP66]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP29]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <64 x i8> [[X1]], <64 x i8> [[TMP31]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]]
@@ -96,17 +88,13 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8>
-; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i32> [[TMP5]] to <64 x i8>
-; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <64 x i8> [[TMP41]], zeroinitializer
-; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <64 x i8> [[TMP42]], zeroinitializer
-; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <64 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT: [[TMP39:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <64 x i8> [[TMP40]], zeroinitializer
-; CHECK-NEXT: [[TMP47:%.*]] = and <64 x i1> [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP48:%.*]] = and <64 x i1> [[TMP45]], [[TMP44]]
-; CHECK-NEXT: [[TMP49:%.*]] = and <64 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <64 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <64 x i8> [[X4]], zeroinitializer
+; CHECK-NEXT: [[TMP47:%.*]] = and <64 x i1> [[TMP39]], [[TMP46]]
+; CHECK-NEXT: [[TMP48:%.*]] = and <64 x i1> [[TMP41]], [[TMP46]]
+; CHECK-NEXT: [[TMP49:%.*]] = and <64 x i1> [[TMP39]], [[TMP42]]
; CHECK-NEXT: [[TMP50:%.*]] = or <64 x i1> [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[TMP51:%.*]] = or <64 x i1> [[TMP50]], [[TMP49]]
; CHECK-NEXT: [[TMP52:%.*]] = sext <64 x i1> [[TMP51]] to <64 x i8>
@@ -116,7 +104,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
; CHECK-NEXT: [[TMP56:%.*]] = bitcast <32 x i16> [[TMP55]] to i512
; CHECK-NEXT: [[TMP57:%.*]] = bitcast i512 [[TMP56]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP57]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <64 x i8> [[X1]], <64 x i8> [[X4]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer
@@ -132,11 +120,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
; CHECK-NEXT: store { <16 x i32>, <16 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret { <16 x i32>, <16 x i32> } [[RES2]]
;
- %x2 = load <16 x i32>, ptr %x2p
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %x2 = load <64 x i8>, ptr %x2p
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
- %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+ %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x4)
%5 = bitcast i16 %x3 to <16 x i1>
%6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
%res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
@@ -144,26 +132,22 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
ret { <16 x i32>, <16 x i32> } %res2
}
-declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <64 x i8>, <64 x i8>)
-define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory {
+define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) sanitize_memory {
; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpbusds_512(
-; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-SAME: <16 x i32> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i8> [[X2:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP24:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <64 x i8> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = and <64 x i1> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <64 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <64 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = and <64 x i1> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP6]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP15:%.*]] = or <64 x i1> [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = sext <64 x i1> [[TMP16]] to <64 x i8>
@@ -173,22 +157,22 @@ define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x i16> [[TMP20]] to i512
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i512 [[TMP21]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <64 x i8> [[X1]], <64 x i8> [[X2]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP4]]
;
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2)
ret <16 x i32> %1
}
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <64 x i8> %x1, ptr %x2p, <64 x i8> %x4, i16 %x3) sanitize_memory {
; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(
-; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: <16 x i32> [[X0:%.*]], <64 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <64 x i8> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP33:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP40:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -196,22 +180,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
; CHECK: [[BB7]]:
-; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P]], align 64
+; CHECK-NEXT: [[TMP31:%.*]] = load <64 x i8>, ptr [[X2P]], align 64
; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8>
-; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <64 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT: [[TMP30:%.*]] = load <64 x i8>, ptr [[TMP10]], align 64
; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <64 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <64 x i8> [[X1]], zeroinitializer
; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <64 x i8> [[TMP31]], zeroinitializer
-; CHECK-NEXT: [[TMP38:%.*]] = and <64 x i1> [[TMP34]], [[TMP35]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <64 x i1> [[TMP36]], [[TMP35]]
-; CHECK-NEXT: [[TMP59:%.*]] = and <64 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = and <64 x i1> [[TMP35]], [[TMP36]]
+; CHECK-NEXT: [[TMP58:%.*]] = and <64 x i1> [[TMP32]], [[TMP36]]
+; CHECK-NEXT: [[TMP59:%.*]] = and <64 x i1> [[TMP35]], [[TMP37]]
; CHECK-NEXT: [[TMP60:%.*]] = or <64 x i1> [[TMP38]], [[TMP58]]
; CHECK-NEXT: [[TMP61:%.*]] = or <64 x i1> [[TMP60]], [[TMP59]]
; CHECK-NEXT: [[TMP62:%.*]] = sext <64 x i1> [[TMP61]] to <64 x i8>
@@ -221,7 +201,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
; CHECK-NEXT: [[TMP66:%.*]] = bitcast <32 x i16> [[TMP65]] to i512
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i512 [[TMP66]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP29]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <64 x i8> [[X1]], <64 x i8> [[TMP31]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]]
@@ -230,17 +210,13 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8>
-; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i32> [[TMP5]] to <64 x i8>
-; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <64 x i8> [[TMP41]], zeroinitializer
-; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <64 x i8> [[TMP42]], zeroinitializer
-; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <64 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT: [[TMP39:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer
; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <64 x i8> [[TMP40]], zeroinitializer
-; CHECK-NEXT: [[TMP47:%.*]] = and <64 x i1> [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP48:%.*]] = and <64 x i1> [[TMP45]], [[TMP44]]
-; CHECK-NEXT: [[TMP49:%.*]] = and <64 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <64 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <64 x i8> [[X4]], zeroinitializer
+; CHECK-NEXT: [[TMP47:%.*]] = and <64 x i1> [[TMP39]], [[TMP46]]
+; CHECK-NEXT: [[TMP48:%.*]] = and <64 x i1> [[TMP41]], [[TMP46]]
+; CHECK-NEXT: [[TMP49:%.*]] = and <64 x i1> [[TMP39]], [[TMP42]]
; CHECK-NEXT: [[TMP50:%.*]] = or <64 x i1> [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[TMP51:%.*]] = or <64 x i1> [[TMP50]], [[TMP49]]
; CHECK-NEXT: [[TMP52:%.*]] = sext <64 x i1> [[TMP51]] to <64 x i8>
@@ -250,7 +226,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
; CHECK-NEXT: [[TMP56:%.*]] = bitcast <32 x i16> [[TMP55]] to i512
; CHECK-NEXT: [[TMP57:%.*]] = bitcast i512 [[TMP56]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP57]], [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
+; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <64 x i8> [[X1]], <64 x i8> [[X4]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer
@@ -266,11 +242,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
; CHECK-NEXT: store { <16 x i32>, <16 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret { <16 x i32>, <16 x i32> } [[RES2]]
;
- %x2 = load <16 x i32>, ptr %x2p
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %x2 = load <64 x i8>, ptr %x2p
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
- %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+ %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x4)
%5 = bitcast i16 %x3 to <16 x i1>
%6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
%res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll
index 678faef203324..3087bc4e5ad8c 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll
@@ -10,26 +10,22 @@
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>)
-define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
+define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) sanitize_memory {
; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpbusd_256(
-; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[X2:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP6]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
@@ -39,34 +35,30 @@ define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[RES]]
;
- %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
ret <8 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>)
-define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
+define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) sanitize_memory {
; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpbusd_128(
-; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP6]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8>
@@ -76,34 +68,30 @@ define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[RES]]
;
- %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
ret <4 x i32> %res
}
-declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>)
-define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
+define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) sanitize_memory {
; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpbusds_256(
-; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[X2:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP6]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
@@ -113,34 +101,30 @@ define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[RES]]
;
- %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
ret <8 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>)
-define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
+define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) sanitize_memory {
; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpbusds_128(
-; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP6]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8>
@@ -150,11 +134,11 @@ define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[RES]]
;
- %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
ret <4 x i32> %res
}
>From 7868ed7b0a1f9711e962f09100fec4ba2c7fdc65 Mon Sep 17 00:00:00 2001
From: BaiXilin <x53bai at uwaterloo.ca>
Date: Tue, 26 Aug 2025 01:52:04 -0400
Subject: [PATCH 2/2] Fixed memory sanitizer assertion failures
---
.../Instrumentation/MemorySanitizer.cpp | 43 ++++++++++---------
1 file changed, 22 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 27292d1a66c30..a0c89c95d8633 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3860,7 +3860,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
//
// Three operands:
// <4 x i32> @llvm.x86.avx512.vpdpbusd.128
- // (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
+ // (<4 x i32> %s, <16 x i8> %a, <16 x i8> %b)
// (this is equivalent to multiply-add on %a and %b, followed by
// adding/"accumulating" %s. "Accumulation" stores the result in one
// of the source registers, but this accumulate vs. add distinction
@@ -3902,8 +3902,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ReturnType->getPrimitiveSizeInBits());
if (I.arg_size() == 3) {
- assert(ParamType == ReturnType);
- assert(ParamType == I.getArgOperand(0)->getType());
+ FixedVectorType *AccumulatorType =
+ cast<FixedVectorType>(I.getOperand(0)->getType());
+ assert(AccumulatorType == ReturnType);
}
FixedVectorType *ImplicitReturnType = ReturnType;
@@ -5621,19 +5622,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
//
// Multiply and Add Packed Signed and Unsigned Bytes
// < 4 x i32> @llvm.x86.avx512.vpdpbusd.128
- // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // (< 4 x i32>, <16 x i8>, <16 x i8>)
// < 8 x i32> @llvm.x86.avx512.vpdpbusd.256
- // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // (< 8 x i32>, <32 x i8>, <32 x i8>)
// <16 x i32> @llvm.x86.avx512.vpdpbusd.512
- // (<16 x i32>, <16 x i32>, <16 x i32>)
+ // (<16 x i32>, <64 x i8>, <64 x i8>)
//
// Multiply and Add Unsigned and Signed Bytes With Saturation
// < 4 x i32> @llvm.x86.avx512.vpdpbusds.128
- // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // (< 4 x i32>, <16 x i8>, <16 x i8>)
// < 8 x i32> @llvm.x86.avx512.vpdpbusds.256
- // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // (< 8 x i32>, <32 x i8>, <32 x i8>)
// <16 x i32> @llvm.x86.avx512.vpdpbusds.512
- // (<16 x i32>, <16 x i32>, <16 x i32>)
+ // (<16 x i32>, <64 x i8>, <64 x i8>)
//
// < 4 x i32> @llvm.x86.avx2.vpdpbssd.128
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
@@ -5652,30 +5653,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
//
// These intrinsics are auto-upgraded into non-masked forms:
// <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <16 x i8>, <16 x i8>, i8)
// <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <16 x i8>, <16 x i8>, i8)
// <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <32 x i8>, <32 x i8>, i8)
// <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <32 x i8>, <32 x i8>, i8)
// <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <64 x i8>, <64 x i8>, i16)
// <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <64 x i8>, <64 x i8>, i16)
//
// <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <16 x i8>, <16 x i8>, i8)
// <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <16 x i8>, <16 x i8>, i8)
// <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <32 x i8>, <32 x i8>, i8)
// <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <32 x i8>, <32 x i8>, i8)
// <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <64 x i8>, <64 x i8>, i16)
// <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <64 x i8>, <64 x i8>, i16)
case Intrinsic::x86_avx512_vpdpbusd_128:
case Intrinsic::x86_avx512_vpdpbusd_256:
case Intrinsic::x86_avx512_vpdpbusd_512:
More information about the llvm-commits
mailing list