[llvm] [msan] Handle AVX Vector Neural Network Instructions (VNNI) (PR #153927)

Fri Aug 15 22:43:34 PDT 2025

https://github.com/thurstond created https://github.com/llvm/llvm-project/pull/153927

This extends the pmadd handler (recently improved in https://github.com/llvm/llvm-project/pull/153353) to three-operand intrinsics (multiply-add-accumulate), and applies it to the AVX Vector Neural Network Instructions.

Updates the tests from https://github.com/llvm/llvm-project/pull/153135

>From 54d6ae2ce4708981414f400fffb3056853e3244f Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Sat, 16 Aug 2025 05:06:59 +0000
Subject: [PATCH] [msan] Handle AVX Vector Neural Network Instructions (VNNI)

This extends the pmadd handler (recently improved in https://github.com/llvm/llvm-project/pull/153353) to three-operand intrinsics (multiply-add-accumulate), and applies it to the AVX Vector Neural Network Instructions.

Updates the tests from https://github.com/llvm/llvm-project/pull/153135
---
 .../Instrumentation/MemorySanitizer.cpp       | 183 +++++-
 .../X86/avx10_2_512ni-intrinsics.ll           |  86 ++-
 .../X86/avx10_2ni-intrinsics.ll               | 122 ++--
 .../X86/avx512vl_vnni-intrinsics-upgrade.ll   | 546 +++++++++++++++---
 .../X86/avx512vl_vnni-intrinsics.ll           | 546 +++++++++++++++---
 .../X86/avx512vnni-intrinsics-upgrade.ll      | 274 +++++++--
 .../X86/avx512vnni-intrinsics.ll              | 274 +++++++--
 .../X86/avx_vnni-intrinsics.ll                | 194 +++++--
 .../X86/avxvnniint8-intrinsics.ll             | 198 +++++--
 9 files changed, 2066 insertions(+), 357 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 6b394f5338687..f3a7fc1e692b8 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3846,7 +3846,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
-  // Instrument multiply-add intrinsics.
+  // Instrument multiply-add(-accumulate)? intrinsics.
   //
   // e.g., Two operands:
   //         <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
@@ -3854,7 +3854,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   //       Two operands which require an EltSizeInBits override:
   //         <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
   //
-  //       Three operands are not implemented yet:
+  //       Three operands:
   //         <4 x i32> @llvm.x86.avx512.vpdpbusd.128
   //                       (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
   //         (the result of multiply-add'ing %a and %b is accumulated with %s)
@@ -3866,22 +3866,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
         cast<FixedVectorType>(I.getType());
     assert(isa<FixedVectorType>(ReturnType));
 
-    assert(I.arg_size() == 2);
-
     // Vectors A and B, and shadows
-    Value *Va = I.getOperand(0);
-    Value *Vb = I.getOperand(1);
+    Value *Va = nullptr;
+    Value *Vb = nullptr;
+    Value *Sa = nullptr;
+    Value *Sb = nullptr;
 
-    Value *Sa = getShadow(&I, 0);
-    Value *Sb = getShadow(&I, 1);
+    if (I.arg_size() == 2) {
+      Va = I.getOperand(0);
+      Vb = I.getOperand(1);
+
+      Sa = getShadow(&I, 0);
+      Sb = getShadow(&I, 1);
+    } else if (I.arg_size() == 3) {
+      // Operand 0 is the accumulator. We will deal with that below.
+      Va = I.getOperand(1);
+      Vb = I.getOperand(2);
+
+      Sa = getShadow(&I, 1);
+      Sb = getShadow(&I, 2);
+    } else {
+      assert(I.arg_size() == 2 || I.arg_size() == 3);
+    }
 
-    FixedVectorType *ParamType =
-        cast<FixedVectorType>(I.getArgOperand(0)->getType());
-    assert(ParamType == I.getArgOperand(1)->getType());
+    FixedVectorType *ParamType = cast<FixedVectorType>(Va->getType());
+    assert(ParamType == Vb->getType());
 
     assert(ParamType->getPrimitiveSizeInBits() ==
            ReturnType->getPrimitiveSizeInBits());
 
+    if (I.arg_size() == 3) {
+      assert(ParamType == ReturnType);
+      assert(ParamType == I.getArgOperand(0)->getType());
+    }
+
     FixedVectorType *ImplicitReturnType = ReturnType;
     // Step 1: instrument multiplication of corresponding vector elements
     if (EltSizeInBits) {
@@ -3944,10 +3962,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
                          Constant::getNullValue(Horizontal->getType())),
         ImplicitReturnType);
 
-    // For MMX, cast it back to the required fake return type (<1 x i64>).
+    // Cast it back to the required fake return type (e.g., <1 x i64> for MMX).
     if (EltSizeInBits)
       OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
 
+    // Step 3 (if applicable): instrument accumulator
+    if (I.arg_size() == 3)
+      OutShadow = IRB.CreateOr(OutShadow, getShadow(&I, 0));
+
     setShadow(&I, OutShadow);
     setOriginForNaryOp(I);
   }
@@ -5507,6 +5529,143 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
       break;
 
+    // AVX Vector Neural Network Instructions: bytes
+    //
+    // Multiply and Add Unsigned and Signed Bytes
+    //   < 4 x i32> @llvm.x86.avx512.vpdpbusd.128
+    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //   < 8 x i32> @llvm.x86.avx512.vpdpbusd.256
+    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //   <16 x i32> @llvm.x86.avx512.vpdpbusd.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //
+    // Multiply and Add Unsigned and Signed Bytes With Saturation
+    //   < 4 x i32> @llvm.x86.avx512.vpdpbusds.128
+    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //   < 8 x i32> @llvm.x86.avx512.vpdpbusds.256
+    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //   <16 x i32> @llvm.x86.avx512.vpdpbusds.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //
+    //   < 4 x i32> @llvm.x86.avx2.vpdpbssd.128
+    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpbssd.256
+    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //
+    //   < 4 x i32> @llvm.x86.avx2.vpdpbssds.128
+    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpbssds.256
+    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //
+    //   <16 x i32> @llvm.x86.avx10.vpdpbssd.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpbssds.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //
+    // These intrinsics are auto-upgraded into non-masked forms:
+    //   <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128
+    //                  (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128
+    //                  (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256
+    //                  (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256
+    //                  (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //   <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //
+    //   <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128
+    //                  (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128
+    //                  (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256
+    //                  (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256
+    //                  (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //   <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    case Intrinsic::x86_avx512_vpdpbusd_128:
+    case Intrinsic::x86_avx512_vpdpbusd_256:
+    case Intrinsic::x86_avx512_vpdpbusd_512:
+    case Intrinsic::x86_avx512_vpdpbusds_128:
+    case Intrinsic::x86_avx512_vpdpbusds_256:
+    case Intrinsic::x86_avx512_vpdpbusds_512:
+    case Intrinsic::x86_avx2_vpdpbssd_128:
+    case Intrinsic::x86_avx2_vpdpbssd_256:
+    case Intrinsic::x86_avx2_vpdpbssds_128:
+    case Intrinsic::x86_avx2_vpdpbssds_256:
+    case Intrinsic::x86_avx10_vpdpbssd_512:
+    case Intrinsic::x86_avx10_vpdpbssds_512:
+      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8);
+      break;
+
+    // AVX Vector Neural Network Instructions: words
+    //
+    // Multiply and Add Signed Word Integers
+    //   < 4 x i32> @llvm.x86.avx512.vpdpwssd.128
+    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //   < 8 x i32> @llvm.x86.avx512.vpdpwssd.256
+    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //   <16 x i32> @llvm.x86.avx512.vpdpwssd.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //
+    // Multiply and Add Signed Word Integers With Saturation
+    //   < 4 x i32> @llvm.x86.avx512.vpdpwssds.128
+    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //   < 8 x i32> @llvm.x86.avx512.vpdpwssds.256
+    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //   <16 x i32> @llvm.x86.avx512.vpdpwssds.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //
+    // These intrinsics are auto-upgraded into non-masked forms:
+    //   <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128
+    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128
+    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256
+    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256
+    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512
+    //                 (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //   <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512
+    //                 (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //
+    //   <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128
+    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128
+    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256
+    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256
+    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512
+    //                 (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //   <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512
+    //                 (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    case Intrinsic::x86_avx512_vpdpwssd_128:
+    case Intrinsic::x86_avx512_vpdpwssd_256:
+    case Intrinsic::x86_avx512_vpdpwssd_512:
+    case Intrinsic::x86_avx512_vpdpwssds_128:
+    case Intrinsic::x86_avx512_vpdpwssds_256:
+    case Intrinsic::x86_avx512_vpdpwssds_512:
+      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+      break;
+
+      // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single
+      // Precision
+      //   <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128
+      //                   (<4 x float>, <8 x bfloat>, <8 x bfloat>)
+      //   <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256
+      //                   (<8 x float>, <16 x bfloat>, <16 x bfloat>)
+      //   <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512
+      //                   (<16 x float>, <32 x bfloat>, <32 x bfloat>)
+      // handleVectorPmaddIntrinsic() currently only handles integer types.
+
     case Intrinsic::x86_sse_cmp_ss:
     case Intrinsic::x86_sse2_cmp_sd:
     case Intrinsic::x86_sse_comieq_ss:
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll
index 7af8f34d403a0..298dc4b2c853a 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll
@@ -7,19 +7,7 @@
 ; - llvm.x86.avx10.vdpphps.512
 ; - llvm.x86.avx10.vmpsadbw.512
 ;
-; Handled heuristically:
-; - llvm.x86.avx10.vpdpbssd.512
-; - llvm.x86.avx10.vpdpbssds.512
-; - llvm.x86.avx10.vpdpbsud.512
-; - llvm.x86.avx10.vpdpbsuds.512
-; - llvm.x86.avx10.vpdpbuud.512
-; - llvm.x86.avx10.vpdpbuuds.512
-; - llvm.x86.avx10.vpdpwsud.512
-; - llvm.x86.avx10.vpdpwsuds.512
-; - llvm.x86.avx10.vpdpwusd.512
-; - llvm.x86.avx10.vpdpwusds.512
-; - llvm.x86.avx10.vpdpwuud.512
-; - llvm.x86.avx10.vpdpwuuds.512
+; Handled heuristically: (none)
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -140,8 +128,8 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 ; CHECK-LABEL: define <16 x i32> @test_mm512_dpbssd_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -154,8 +142,26 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <64 x i8> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <64 x i8> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <64 x i8> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <64 x i8> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = and <64 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP18:%.*]] = and <64 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = and <64 x i1> [[TMP13]], [[TMP16]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or <64 x i1> [[TMP17]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or <64 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = sext <64 x i1> [[TMP21]] to <64 x i8>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <64 x i8> [[TMP22]] to <32 x i16>
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp ne <32 x i16> [[TMP23]], zeroinitializer
+; CHECK-NEXT:    [[TMP25:%.*]] = sext <32 x i1> [[TMP24]] to <32 x i16>
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <32 x i16> [[TMP25]] to i512
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i512 [[TMP26]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP27]], [[TMP4]]
 ; CHECK-NEXT:    [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[RES]]
@@ -168,13 +174,31 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbssds_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8>
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8>
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp ne <64 x i8> [[TMP26]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP27]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <64 x i8> [[TMP25]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <64 x i1> [[TMP28]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <64 x i1> [[TMP28]], [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <64 x i8> [[TMP18]] to <32 x i16>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = sext <32 x i1> [[TMP20]] to <32 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <32 x i16> [[TMP21]] to i512
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i512 [[TMP22]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP23]], [[TMP1]]
 ; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
@@ -196,13 +220,31 @@ define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %_
 define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbssd_epi32(
 ; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8>
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8>
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <64 x i8> [[TMP27]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP28]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP25]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <64 x i8> [[TMP26]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <64 x i1> [[TMP29]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <64 x i1> [[TMP29]], [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <64 x i8> [[TMP18]] to <32 x i16>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = sext <32 x i1> [[TMP20]] to <32 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <32 x i16> [[TMP21]] to i512
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i512 [[TMP22]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP23]], [[TMP24]]
 ; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll
index 5f0b0b39da4d9..e3a26ae07ac1b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll
@@ -10,31 +10,7 @@
 ; - llvm.x86.avx2.mpsadbw
 ; - llvm.x86.sse41.mpsadbw
 ;
-; Handled heuristically:
-; - llvm.x86.avx2.vpdpbssd.128
-; - llvm.x86.avx2.vpdpbssd.256
-; - llvm.x86.avx2.vpdpbssds.128
-; - llvm.x86.avx2.vpdpbssds.256
-; - llvm.x86.avx2.vpdpbsud.128
-; - llvm.x86.avx2.vpdpbsud.256
-; - llvm.x86.avx2.vpdpbsuds.128
-; - llvm.x86.avx2.vpdpbsuds.256
-; - llvm.x86.avx2.vpdpbuud.128
-; - llvm.x86.avx2.vpdpbuud.256
-; - llvm.x86.avx2.vpdpbuuds.128
-; - llvm.x86.avx2.vpdpbuuds.256
-; - llvm.x86.avx2.vpdpwsud.128
-; - llvm.x86.avx2.vpdpwsud.256
-; - llvm.x86.avx2.vpdpwsuds.128
-; - llvm.x86.avx2.vpdpwsuds.256
-; - llvm.x86.avx2.vpdpwusd.128
-; - llvm.x86.avx2.vpdpwusd.256
-; - llvm.x86.avx2.vpdpwusds.128
-; - llvm.x86.avx2.vpdpwusds.256
-; - llvm.x86.avx2.vpdpwuud.128
-; - llvm.x86.avx2.vpdpwuud.256
-; - llvm.x86.avx2.vpdpwuuds.128
-; - llvm.x86.avx2.vpdpwuuds.256
+; Handled heuristically: (none)
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -270,13 +246,31 @@ declare <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float>, <16 x half>, <16 x
 define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpbssd_epi32(
 ; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x i32> [[__A]] to <16 x i8>
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[__B]] to <16 x i8>
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp ne <16 x i8> [[TMP26]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP27]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <16 x i8> [[TMP25]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP28]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <16 x i1> [[TMP28]], [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = sext <16 x i1> [[TMP17]] to <16 x i8>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <16 x i8> [[TMP18]] to <8 x i16>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <8 x i16> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = sext <8 x i1> [[TMP20]] to <8 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i16> [[TMP21]] to i128
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i128 [[TMP22]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP23]], [[TMP1]]
 ; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
@@ -298,13 +292,31 @@ define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
 define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpbssds_epi32(
 ; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[__A]] to <16 x i8>
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[__B]] to <16 x i8>
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <16 x i8> [[TMP27]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP28]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP25]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <16 x i8> [[TMP26]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP29]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <16 x i1> [[TMP29]], [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = sext <16 x i1> [[TMP17]] to <16 x i8>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <16 x i8> [[TMP18]] to <8 x i16>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <8 x i16> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = sext <8 x i1> [[TMP20]] to <8 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i16> [[TMP21]] to i128
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i128 [[TMP22]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP23]], [[TMP24]]
 ; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
@@ -326,13 +338,31 @@ define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
 define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpbssds_epi32(
 ; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i32> [[__A]] to <32 x i8>
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i32> [[__B]] to <32 x i8>
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp ne <32 x i8> [[TMP26]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP27]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <32 x i8> [[TMP25]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP28]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <32 x i1> [[TMP28]], [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i8>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <32 x i8> [[TMP18]] to <16 x i16>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <16 x i16> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i16> [[TMP21]] to i256
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i256 [[TMP22]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP23]], [[TMP1]]
 ; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1>
@@ -354,13 +384,31 @@ define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U
 define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpbssd_epi32(
 ; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i32> [[__A]] to <32 x i8>
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i32> [[__B]] to <32 x i8>
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <32 x i8> [[TMP27]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP28]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP25]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <32 x i8> [[TMP26]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP29]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <32 x i1> [[TMP29]], [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i8>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <32 x i8> [[TMP18]] to <16 x i16>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <16 x i16> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i16> [[TMP21]] to i256
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i256 [[TMP22]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP23]], [[TMP24]]
 ; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll
index 983d5aaada652..822e546c84bca 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll
@@ -5,15 +5,7 @@
 ;
 ; Handled strictly: (none)
 ;
-; Handled heuristically:
-; - llvm.x86.avx512.vpdpbusd.128
-; - llvm.x86.avx512.vpdpbusd.256
-; - llvm.x86.avx512.vpdpbusds.128
-; - llvm.x86.avx512.vpdpbusds.256
-; - llvm.x86.avx512.vpdpwssd.128
-; - llvm.x86.avx512.vpdpwssd.256
-; - llvm.x86.avx512.vpdpwssds.128
-; - llvm.x86.avx512.vpdpwssds.256
+; Handled heuristically: (none)
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -24,12 +16,30 @@ declare <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x
 define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusd_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <32 x i8> [[TMP17]] to <16 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
@@ -42,8 +52,8 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
 ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -58,8 +68,26 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = and <32 x i1> [[TMP34]], [[TMP35]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and <32 x i1> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP59:%.*]] = and <32 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT:    [[TMP60:%.*]] = or <32 x i1> [[TMP38]], [[TMP58]]
+; CHECK-NEXT:    [[TMP61:%.*]] = or <32 x i1> [[TMP60]], [[TMP59]]
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <32 x i1> [[TMP61]] to <32 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = bitcast <32 x i8> [[TMP62]] to <16 x i16>
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp ne <16 x i16> [[TMP63]], zeroinitializer
+; CHECK-NEXT:    [[TMP65:%.*]] = sext <16 x i1> [[TMP64]] to <16 x i16>
+; CHECK-NEXT:    [[TMP66:%.*]] = bitcast <16 x i16> [[TMP65]] to i256
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i256 [[TMP66]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP29]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -69,8 +97,26 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i32> [[TMP5]] to <32 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <32 x i8> [[TMP41]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <32 x i8> [[TMP42]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp ne <32 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <32 x i8> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = and <32 x i1> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and <32 x i1> [[TMP45]], [[TMP44]]
+; CHECK-NEXT:    [[TMP49:%.*]] = and <32 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = or <32 x i1> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or <32 x i1> [[TMP50]], [[TMP49]]
+; CHECK-NEXT:    [[TMP52:%.*]] = sext <32 x i1> [[TMP51]] to <32 x i8>
+; CHECK-NEXT:    [[TMP53:%.*]] = bitcast <32 x i8> [[TMP52]] to <16 x i16>
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp ne <16 x i16> [[TMP53]], zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = sext <16 x i1> [[TMP54]] to <16 x i16>
+; CHECK-NEXT:    [[TMP56:%.*]] = bitcast <16 x i16> [[TMP55]] to i256
+; CHECK-NEXT:    [[TMP57:%.*]] = bitcast i256 [[TMP56]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP57]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -101,12 +147,30 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x
 define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusd_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i8> [[TMP17]] to <8 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
@@ -119,8 +183,8 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
 ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -135,8 +199,26 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = and <16 x i1> [[TMP34]], [[TMP35]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and <16 x i1> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP59:%.*]] = and <16 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT:    [[TMP60:%.*]] = or <16 x i1> [[TMP38]], [[TMP58]]
+; CHECK-NEXT:    [[TMP61:%.*]] = or <16 x i1> [[TMP60]], [[TMP59]]
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = bitcast <16 x i8> [[TMP62]] to <8 x i16>
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp ne <8 x i16> [[TMP63]], zeroinitializer
+; CHECK-NEXT:    [[TMP65:%.*]] = sext <8 x i1> [[TMP64]] to <8 x i16>
+; CHECK-NEXT:    [[TMP66:%.*]] = bitcast <8 x i16> [[TMP65]] to i128
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i128 [[TMP66]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP29]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -148,8 +230,26 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP42:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <16 x i8> [[TMP41]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <16 x i8> [[TMP42]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp ne <16 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <16 x i8> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = and <16 x i1> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and <16 x i1> [[TMP45]], [[TMP44]]
+; CHECK-NEXT:    [[TMP49:%.*]] = and <16 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = or <16 x i1> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or <16 x i1> [[TMP50]], [[TMP49]]
+; CHECK-NEXT:    [[TMP52:%.*]] = sext <16 x i1> [[TMP51]] to <16 x i8>
+; CHECK-NEXT:    [[TMP53:%.*]] = bitcast <16 x i8> [[TMP52]] to <8 x i16>
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp ne <8 x i16> [[TMP53]], zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = sext <8 x i1> [[TMP54]] to <8 x i16>
+; CHECK-NEXT:    [[TMP56:%.*]] = bitcast <8 x i16> [[TMP55]] to i128
+; CHECK-NEXT:    [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[TMP57]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -182,12 +282,30 @@ declare <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256(<8 x i32>, <8 x i32>, <8
 define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusds_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <32 x i8> [[TMP17]] to <16 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
@@ -200,8 +318,8 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
 ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -216,8 +334,26 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = and <32 x i1> [[TMP34]], [[TMP35]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and <32 x i1> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP59:%.*]] = and <32 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT:    [[TMP60:%.*]] = or <32 x i1> [[TMP38]], [[TMP58]]
+; CHECK-NEXT:    [[TMP61:%.*]] = or <32 x i1> [[TMP60]], [[TMP59]]
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <32 x i1> [[TMP61]] to <32 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = bitcast <32 x i8> [[TMP62]] to <16 x i16>
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp ne <16 x i16> [[TMP63]], zeroinitializer
+; CHECK-NEXT:    [[TMP65:%.*]] = sext <16 x i1> [[TMP64]] to <16 x i16>
+; CHECK-NEXT:    [[TMP66:%.*]] = bitcast <16 x i16> [[TMP65]] to i256
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i256 [[TMP66]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP29]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -227,8 +363,26 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i32> [[TMP5]] to <32 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <32 x i8> [[TMP41]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <32 x i8> [[TMP42]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp ne <32 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <32 x i8> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = and <32 x i1> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and <32 x i1> [[TMP45]], [[TMP44]]
+; CHECK-NEXT:    [[TMP49:%.*]] = and <32 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = or <32 x i1> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or <32 x i1> [[TMP50]], [[TMP49]]
+; CHECK-NEXT:    [[TMP52:%.*]] = sext <32 x i1> [[TMP51]] to <32 x i8>
+; CHECK-NEXT:    [[TMP53:%.*]] = bitcast <32 x i8> [[TMP52]] to <16 x i16>
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp ne <16 x i16> [[TMP53]], zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = sext <16 x i1> [[TMP54]] to <16 x i16>
+; CHECK-NEXT:    [[TMP56:%.*]] = bitcast <16 x i16> [[TMP55]] to i256
+; CHECK-NEXT:    [[TMP57:%.*]] = bitcast i256 [[TMP56]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP57]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -259,12 +413,30 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128(<4 x i32>, <4 x i32>, <4
 define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusds_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i8> [[TMP17]] to <8 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
@@ -277,8 +449,8 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
 ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -293,8 +465,26 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = and <16 x i1> [[TMP34]], [[TMP35]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and <16 x i1> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP59:%.*]] = and <16 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT:    [[TMP60:%.*]] = or <16 x i1> [[TMP38]], [[TMP58]]
+; CHECK-NEXT:    [[TMP61:%.*]] = or <16 x i1> [[TMP60]], [[TMP59]]
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = bitcast <16 x i8> [[TMP62]] to <8 x i16>
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp ne <8 x i16> [[TMP63]], zeroinitializer
+; CHECK-NEXT:    [[TMP65:%.*]] = sext <8 x i1> [[TMP64]] to <8 x i16>
+; CHECK-NEXT:    [[TMP66:%.*]] = bitcast <8 x i16> [[TMP65]] to i128
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i128 [[TMP66]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP29]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -306,8 +496,26 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP42:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <16 x i8> [[TMP41]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <16 x i8> [[TMP42]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp ne <16 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <16 x i8> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = and <16 x i1> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and <16 x i1> [[TMP45]], [[TMP44]]
+; CHECK-NEXT:    [[TMP49:%.*]] = and <16 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = or <16 x i1> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or <16 x i1> [[TMP50]], [[TMP49]]
+; CHECK-NEXT:    [[TMP52:%.*]] = sext <16 x i1> [[TMP51]] to <16 x i8>
+; CHECK-NEXT:    [[TMP53:%.*]] = bitcast <16 x i8> [[TMP52]] to <8 x i16>
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp ne <8 x i16> [[TMP53]], zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = sext <8 x i1> [[TMP54]] to <8 x i16>
+; CHECK-NEXT:    [[TMP56:%.*]] = bitcast <8 x i16> [[TMP55]] to i128
+; CHECK-NEXT:    [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[TMP57]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -340,12 +548,28 @@ declare <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x
 define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpwssd_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i16> [[TMP17]] to <8 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
@@ -358,8 +582,8 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -374,8 +598,24 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16>
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <16 x i16> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP54:%.*]] = and <16 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP55:%.*]] = and <16 x i1> [[TMP35]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and <16 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP57:%.*]] = or <16 x i1> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP58:%.*]] = or <16 x i1> [[TMP57]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sext <16 x i1> [[TMP58]] to <16 x i16>
+; CHECK-NEXT:    [[TMP60:%.*]] = bitcast <16 x i16> [[TMP59]] to <8 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -385,8 +625,24 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16>
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <16 x i16> [[TMP38]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = and <16 x i1> [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and <16 x i1> [[TMP43]], [[TMP42]]
+; CHECK-NEXT:    [[TMP47:%.*]] = and <16 x i1> [[TMP41]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = or <16 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP49:%.*]] = or <16 x i1> [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP50:%.*]] = sext <16 x i1> [[TMP49]] to <16 x i16>
+; CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i16> [[TMP50]] to <8 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer
+; CHECK-NEXT:    [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -417,12 +673,28 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x
 define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpwssd_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <8 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <8 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <8 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <8 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x i16> [[TMP17]] to <4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
@@ -435,8 +707,8 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -451,8 +723,24 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <8 x i16> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP54:%.*]] = and <8 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP55:%.*]] = and <8 x i1> [[TMP35]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and <8 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP57:%.*]] = or <8 x i1> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP58:%.*]] = or <8 x i1> [[TMP57]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sext <8 x i1> [[TMP58]] to <8 x i16>
+; CHECK-NEXT:    [[TMP60:%.*]] = bitcast <8 x i16> [[TMP59]] to <4 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -464,8 +752,24 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <8 x i16> [[TMP38]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = and <8 x i1> [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and <8 x i1> [[TMP43]], [[TMP42]]
+; CHECK-NEXT:    [[TMP47:%.*]] = and <8 x i1> [[TMP41]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = or <8 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP49:%.*]] = or <8 x i1> [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP50:%.*]] = sext <8 x i1> [[TMP49]] to <8 x i16>
+; CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i16> [[TMP50]] to <4 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer
+; CHECK-NEXT:    [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -499,12 +803,28 @@ declare <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256(<8 x i32>, <8 x i32>, <8
 define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpwssds_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i16> [[TMP17]] to <8 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
@@ -517,8 +837,8 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -533,8 +853,24 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16>
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <16 x i16> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP54:%.*]] = and <16 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP55:%.*]] = and <16 x i1> [[TMP35]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and <16 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP57:%.*]] = or <16 x i1> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP58:%.*]] = or <16 x i1> [[TMP57]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sext <16 x i1> [[TMP58]] to <16 x i16>
+; CHECK-NEXT:    [[TMP60:%.*]] = bitcast <16 x i16> [[TMP59]] to <8 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -544,8 +880,24 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16>
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <16 x i16> [[TMP38]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = and <16 x i1> [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and <16 x i1> [[TMP43]], [[TMP42]]
+; CHECK-NEXT:    [[TMP47:%.*]] = and <16 x i1> [[TMP41]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = or <16 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP49:%.*]] = or <16 x i1> [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP50:%.*]] = sext <16 x i1> [[TMP49]] to <16 x i16>
+; CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i16> [[TMP50]] to <8 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer
+; CHECK-NEXT:    [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -576,12 +928,28 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128(<4 x i32>, <4 x i32>, <4
 define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpwssds_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <8 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <8 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <8 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <8 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x i16> [[TMP17]] to <4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
@@ -594,8 +962,8 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -610,8 +978,24 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <8 x i16> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP54:%.*]] = and <8 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP55:%.*]] = and <8 x i1> [[TMP35]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and <8 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP57:%.*]] = or <8 x i1> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP58:%.*]] = or <8 x i1> [[TMP57]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sext <8 x i1> [[TMP58]] to <8 x i16>
+; CHECK-NEXT:    [[TMP60:%.*]] = bitcast <8 x i16> [[TMP59]] to <4 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -623,8 +1007,24 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <8 x i16> [[TMP38]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = and <8 x i1> [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and <8 x i1> [[TMP43]], [[TMP42]]
+; CHECK-NEXT:    [[TMP47:%.*]] = and <8 x i1> [[TMP41]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = or <8 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP49:%.*]] = or <8 x i1> [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP50:%.*]] = sext <8 x i1> [[TMP49]] to <8 x i16>
+; CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i16> [[TMP50]] to <4 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer
+; CHECK-NEXT:    [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll
index 234d68f1aaf56..38f4272ef106f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll
@@ -5,15 +5,7 @@
 ;
 ; Handled strictly: (none)
 ;
-; Handled heuristically:
-; - llvm.x86.avx512.vpdpbusd.128
-; - llvm.x86.avx512.vpdpbusd.256
-; - llvm.x86.avx512.vpdpbusds.128
-; - llvm.x86.avx512.vpdpbusds.256
-; - llvm.x86.avx512.vpdpwssd.128
-; - llvm.x86.avx512.vpdpwssd.256
-; - llvm.x86.avx512.vpdpwssds.128
-; - llvm.x86.avx512.vpdpwssds.256
+; Handled heuristically: (none)
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -23,12 +15,30 @@ declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
 define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusd_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <32 x i8> [[TMP17]] to <16 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
@@ -41,8 +51,8 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
 ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -57,8 +67,26 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = and <32 x i1> [[TMP34]], [[TMP35]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and <32 x i1> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP59:%.*]] = and <32 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT:    [[TMP60:%.*]] = or <32 x i1> [[TMP38]], [[TMP58]]
+; CHECK-NEXT:    [[TMP61:%.*]] = or <32 x i1> [[TMP60]], [[TMP59]]
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <32 x i1> [[TMP61]] to <32 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = bitcast <32 x i8> [[TMP62]] to <16 x i16>
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp ne <16 x i16> [[TMP63]], zeroinitializer
+; CHECK-NEXT:    [[TMP65:%.*]] = sext <16 x i1> [[TMP64]] to <16 x i16>
+; CHECK-NEXT:    [[TMP66:%.*]] = bitcast <16 x i16> [[TMP65]] to i256
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i256 [[TMP66]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP29]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -68,8 +96,26 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i32> [[TMP5]] to <32 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <32 x i8> [[TMP41]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <32 x i8> [[TMP42]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp ne <32 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <32 x i8> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = and <32 x i1> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and <32 x i1> [[TMP45]], [[TMP44]]
+; CHECK-NEXT:    [[TMP49:%.*]] = and <32 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = or <32 x i1> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or <32 x i1> [[TMP50]], [[TMP49]]
+; CHECK-NEXT:    [[TMP52:%.*]] = sext <32 x i1> [[TMP51]] to <32 x i8>
+; CHECK-NEXT:    [[TMP53:%.*]] = bitcast <32 x i8> [[TMP52]] to <16 x i16>
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp ne <16 x i16> [[TMP53]], zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = sext <16 x i1> [[TMP54]] to <16 x i16>
+; CHECK-NEXT:    [[TMP56:%.*]] = bitcast <16 x i16> [[TMP55]] to i256
+; CHECK-NEXT:    [[TMP57:%.*]] = bitcast i256 [[TMP56]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP57]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -103,12 +149,30 @@ declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
 define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusd_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i8> [[TMP17]] to <8 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
@@ -121,8 +185,8 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
 ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -137,8 +201,26 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = and <16 x i1> [[TMP34]], [[TMP35]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and <16 x i1> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP59:%.*]] = and <16 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT:    [[TMP60:%.*]] = or <16 x i1> [[TMP38]], [[TMP58]]
+; CHECK-NEXT:    [[TMP61:%.*]] = or <16 x i1> [[TMP60]], [[TMP59]]
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = bitcast <16 x i8> [[TMP62]] to <8 x i16>
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp ne <8 x i16> [[TMP63]], zeroinitializer
+; CHECK-NEXT:    [[TMP65:%.*]] = sext <8 x i1> [[TMP64]] to <8 x i16>
+; CHECK-NEXT:    [[TMP66:%.*]] = bitcast <8 x i16> [[TMP65]] to i128
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i128 [[TMP66]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP29]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -150,8 +232,26 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP42:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <16 x i8> [[TMP41]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <16 x i8> [[TMP42]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp ne <16 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <16 x i8> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = and <16 x i1> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and <16 x i1> [[TMP45]], [[TMP44]]
+; CHECK-NEXT:    [[TMP49:%.*]] = and <16 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = or <16 x i1> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or <16 x i1> [[TMP50]], [[TMP49]]
+; CHECK-NEXT:    [[TMP52:%.*]] = sext <16 x i1> [[TMP51]] to <16 x i8>
+; CHECK-NEXT:    [[TMP53:%.*]] = bitcast <16 x i8> [[TMP52]] to <8 x i16>
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp ne <8 x i16> [[TMP53]], zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = sext <8 x i1> [[TMP54]] to <8 x i16>
+; CHECK-NEXT:    [[TMP56:%.*]] = bitcast <8 x i16> [[TMP55]] to i128
+; CHECK-NEXT:    [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP57]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -189,12 +289,30 @@ declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>
 define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusds_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <32 x i8> [[TMP17]] to <16 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
@@ -207,8 +325,8 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
 ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -223,8 +341,26 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = and <32 x i1> [[TMP34]], [[TMP35]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and <32 x i1> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP59:%.*]] = and <32 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT:    [[TMP60:%.*]] = or <32 x i1> [[TMP38]], [[TMP58]]
+; CHECK-NEXT:    [[TMP61:%.*]] = or <32 x i1> [[TMP60]], [[TMP59]]
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <32 x i1> [[TMP61]] to <32 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = bitcast <32 x i8> [[TMP62]] to <16 x i16>
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp ne <16 x i16> [[TMP63]], zeroinitializer
+; CHECK-NEXT:    [[TMP65:%.*]] = sext <16 x i1> [[TMP64]] to <16 x i16>
+; CHECK-NEXT:    [[TMP66:%.*]] = bitcast <16 x i16> [[TMP65]] to i256
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i256 [[TMP66]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP29]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -234,8 +370,26 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i32> [[TMP5]] to <32 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <32 x i8> [[TMP41]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <32 x i8> [[TMP42]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp ne <32 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <32 x i8> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = and <32 x i1> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and <32 x i1> [[TMP45]], [[TMP44]]
+; CHECK-NEXT:    [[TMP49:%.*]] = and <32 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = or <32 x i1> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or <32 x i1> [[TMP50]], [[TMP49]]
+; CHECK-NEXT:    [[TMP52:%.*]] = sext <32 x i1> [[TMP51]] to <32 x i8>
+; CHECK-NEXT:    [[TMP53:%.*]] = bitcast <32 x i8> [[TMP52]] to <16 x i16>
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp ne <16 x i16> [[TMP53]], zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = sext <16 x i1> [[TMP54]] to <16 x i16>
+; CHECK-NEXT:    [[TMP56:%.*]] = bitcast <16 x i16> [[TMP55]] to i256
+; CHECK-NEXT:    [[TMP57:%.*]] = bitcast i256 [[TMP56]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP57]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -269,12 +423,30 @@ declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>
 define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusds_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i8> [[TMP17]] to <8 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
@@ -287,8 +459,8 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
 ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -303,8 +475,26 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = and <16 x i1> [[TMP34]], [[TMP35]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and <16 x i1> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP59:%.*]] = and <16 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT:    [[TMP60:%.*]] = or <16 x i1> [[TMP38]], [[TMP58]]
+; CHECK-NEXT:    [[TMP61:%.*]] = or <16 x i1> [[TMP60]], [[TMP59]]
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = bitcast <16 x i8> [[TMP62]] to <8 x i16>
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp ne <8 x i16> [[TMP63]], zeroinitializer
+; CHECK-NEXT:    [[TMP65:%.*]] = sext <8 x i1> [[TMP64]] to <8 x i16>
+; CHECK-NEXT:    [[TMP66:%.*]] = bitcast <8 x i16> [[TMP65]] to i128
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i128 [[TMP66]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP29]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -316,8 +506,26 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP42:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <16 x i8> [[TMP41]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <16 x i8> [[TMP42]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp ne <16 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <16 x i8> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = and <16 x i1> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and <16 x i1> [[TMP45]], [[TMP44]]
+; CHECK-NEXT:    [[TMP49:%.*]] = and <16 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = or <16 x i1> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or <16 x i1> [[TMP50]], [[TMP49]]
+; CHECK-NEXT:    [[TMP52:%.*]] = sext <16 x i1> [[TMP51]] to <16 x i8>
+; CHECK-NEXT:    [[TMP53:%.*]] = bitcast <16 x i8> [[TMP52]] to <8 x i16>
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp ne <8 x i16> [[TMP53]], zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = sext <8 x i1> [[TMP54]] to <8 x i16>
+; CHECK-NEXT:    [[TMP56:%.*]] = bitcast <8 x i16> [[TMP55]] to i128
+; CHECK-NEXT:    [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP57]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -355,12 +563,28 @@ declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
 define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpwssd_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i16> [[TMP17]] to <8 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
@@ -373,8 +597,8 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -389,8 +613,24 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16>
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <16 x i16> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP54:%.*]] = and <16 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP55:%.*]] = and <16 x i1> [[TMP35]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and <16 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP57:%.*]] = or <16 x i1> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP58:%.*]] = or <16 x i1> [[TMP57]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sext <16 x i1> [[TMP58]] to <16 x i16>
+; CHECK-NEXT:    [[TMP60:%.*]] = bitcast <16 x i16> [[TMP59]] to <8 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -400,8 +640,24 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16>
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <16 x i16> [[TMP38]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = and <16 x i1> [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and <16 x i1> [[TMP43]], [[TMP42]]
+; CHECK-NEXT:    [[TMP47:%.*]] = and <16 x i1> [[TMP41]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = or <16 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP49:%.*]] = or <16 x i1> [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP50:%.*]] = sext <16 x i1> [[TMP49]] to <16 x i16>
+; CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i16> [[TMP50]] to <8 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer
+; CHECK-NEXT:    [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -435,12 +691,28 @@ declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
 define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpwssd_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <8 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <8 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <8 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <8 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x i16> [[TMP17]] to <4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
@@ -453,8 +725,8 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -469,8 +741,24 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <8 x i16> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP54:%.*]] = and <8 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP55:%.*]] = and <8 x i1> [[TMP35]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and <8 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP57:%.*]] = or <8 x i1> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP58:%.*]] = or <8 x i1> [[TMP57]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sext <8 x i1> [[TMP58]] to <8 x i16>
+; CHECK-NEXT:    [[TMP60:%.*]] = bitcast <8 x i16> [[TMP59]] to <4 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -482,8 +770,24 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <8 x i16> [[TMP38]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = and <8 x i1> [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and <8 x i1> [[TMP43]], [[TMP42]]
+; CHECK-NEXT:    [[TMP47:%.*]] = and <8 x i1> [[TMP41]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = or <8 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP49:%.*]] = or <8 x i1> [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP50:%.*]] = sext <8 x i1> [[TMP49]] to <8 x i16>
+; CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i16> [[TMP50]] to <4 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer
+; CHECK-NEXT:    [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -521,12 +825,28 @@ declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>
 define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpwssds_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i16> [[TMP17]] to <8 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
@@ -539,8 +859,8 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -555,8 +875,24 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16>
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <16 x i16> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP54:%.*]] = and <16 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP55:%.*]] = and <16 x i1> [[TMP35]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and <16 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP57:%.*]] = or <16 x i1> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP58:%.*]] = or <16 x i1> [[TMP57]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sext <16 x i1> [[TMP58]] to <16 x i16>
+; CHECK-NEXT:    [[TMP60:%.*]] = bitcast <16 x i16> [[TMP59]] to <8 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -566,8 +902,24 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16>
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <16 x i16> [[TMP38]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = and <16 x i1> [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and <16 x i1> [[TMP43]], [[TMP42]]
+; CHECK-NEXT:    [[TMP47:%.*]] = and <16 x i1> [[TMP41]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = or <16 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP49:%.*]] = or <16 x i1> [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP50:%.*]] = sext <16 x i1> [[TMP49]] to <16 x i16>
+; CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i16> [[TMP50]] to <8 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer
+; CHECK-NEXT:    [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -602,8 +954,8 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpwssds_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -616,8 +968,24 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <8 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i16> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i16> [[TMP26]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <8 x i16> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = and <8 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP18:%.*]] = and <8 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = and <8 x i1> [[TMP13]], [[TMP16]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or <8 x i1> [[TMP17]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = sext <8 x i1> [[TMP21]] to <8 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i16> [[TMP22]] to <4 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp ne <4 x i32> [[TMP23]], zeroinitializer
+; CHECK-NEXT:    [[TMP25:%.*]] = sext <4 x i1> [[TMP24]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP25]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
@@ -631,8 +999,8 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -647,8 +1015,24 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <8 x i16> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP54:%.*]] = and <8 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP55:%.*]] = and <8 x i1> [[TMP35]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and <8 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP57:%.*]] = or <8 x i1> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP58:%.*]] = or <8 x i1> [[TMP57]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sext <8 x i1> [[TMP58]] to <8 x i16>
+; CHECK-NEXT:    [[TMP60:%.*]] = bitcast <8 x i16> [[TMP59]] to <4 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -660,8 +1044,24 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <8 x i16> [[TMP38]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = and <8 x i1> [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and <8 x i1> [[TMP43]], [[TMP42]]
+; CHECK-NEXT:    [[TMP47:%.*]] = and <8 x i1> [[TMP41]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = or <8 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP49:%.*]] = or <8 x i1> [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP50:%.*]] = sext <8 x i1> [[TMP49]] to <8 x i16>
+; CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i16> [[TMP50]] to <4 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer
+; CHECK-NEXT:    [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll
index 77306202dc4fe..f146823b90e03 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll
@@ -5,11 +5,7 @@
 ;
 ; Handled strictly: (none)
 ;
-; Handled heuristically:
-; - llvm.x86.avx512.vpdpbusd.512
-; - llvm.x86.avx512.vpdpbusds.512
-; - llvm.x86.avx512.vpdpwssd.512
-; - llvm.x86.avx512.vpdpwssds.512
+; Handled heuristically: (none)
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -20,12 +16,30 @@ declare <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32>, <16 x i32>, <
 define <16 x i32>@test_int_x86_avx512_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpbusd_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <64 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <64 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <64 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <64 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <64 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <64 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <64 x i1> [[TMP16]] to <64 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <64 x i8> [[TMP17]] to <32 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <32 x i1> [[TMP19]] to <32 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <32 x i16> [[TMP20]] to i512
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i512 [[TMP21]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
@@ -38,8 +52,8 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
 ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -54,8 +68,26 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <64 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <64 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <64 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = and <64 x i1> [[TMP34]], [[TMP35]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and <64 x i1> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP59:%.*]] = and <64 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT:    [[TMP60:%.*]] = or <64 x i1> [[TMP38]], [[TMP58]]
+; CHECK-NEXT:    [[TMP61:%.*]] = or <64 x i1> [[TMP60]], [[TMP59]]
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <64 x i1> [[TMP61]] to <64 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = bitcast <64 x i8> [[TMP62]] to <32 x i16>
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp ne <32 x i16> [[TMP63]], zeroinitializer
+; CHECK-NEXT:    [[TMP65:%.*]] = sext <32 x i1> [[TMP64]] to <32 x i16>
+; CHECK-NEXT:    [[TMP66:%.*]] = bitcast <32 x i16> [[TMP65]] to i512
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i512 [[TMP66]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP29]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -65,8 +97,26 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i32> [[TMP5]] to <64 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <64 x i8> [[TMP41]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <64 x i8> [[TMP42]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp ne <64 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <64 x i8> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = and <64 x i1> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and <64 x i1> [[TMP45]], [[TMP44]]
+; CHECK-NEXT:    [[TMP49:%.*]] = and <64 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = or <64 x i1> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or <64 x i1> [[TMP50]], [[TMP49]]
+; CHECK-NEXT:    [[TMP52:%.*]] = sext <64 x i1> [[TMP51]] to <64 x i8>
+; CHECK-NEXT:    [[TMP53:%.*]] = bitcast <64 x i8> [[TMP52]] to <32 x i16>
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp ne <32 x i16> [[TMP53]], zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = sext <32 x i1> [[TMP54]] to <32 x i16>
+; CHECK-NEXT:    [[TMP56:%.*]] = bitcast <32 x i16> [[TMP55]] to i512
+; CHECK-NEXT:    [[TMP57:%.*]] = bitcast i512 [[TMP56]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[TMP57]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -97,12 +147,30 @@ declare <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32>, <16 x i32>,
 define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpbusds_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <64 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <64 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <64 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <64 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <64 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <64 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <64 x i1> [[TMP16]] to <64 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <64 x i8> [[TMP17]] to <32 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <32 x i1> [[TMP19]] to <32 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <32 x i16> [[TMP20]] to i512
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i512 [[TMP21]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
@@ -115,8 +183,8 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
 ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -131,8 +199,26 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <64 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <64 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <64 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = and <64 x i1> [[TMP34]], [[TMP35]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and <64 x i1> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP59:%.*]] = and <64 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT:    [[TMP60:%.*]] = or <64 x i1> [[TMP38]], [[TMP58]]
+; CHECK-NEXT:    [[TMP61:%.*]] = or <64 x i1> [[TMP60]], [[TMP59]]
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <64 x i1> [[TMP61]] to <64 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = bitcast <64 x i8> [[TMP62]] to <32 x i16>
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp ne <32 x i16> [[TMP63]], zeroinitializer
+; CHECK-NEXT:    [[TMP65:%.*]] = sext <32 x i1> [[TMP64]] to <32 x i16>
+; CHECK-NEXT:    [[TMP66:%.*]] = bitcast <32 x i16> [[TMP65]] to i512
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i512 [[TMP66]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP29]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -142,8 +228,26 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i32> [[TMP5]] to <64 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <64 x i8> [[TMP41]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <64 x i8> [[TMP42]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp ne <64 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <64 x i8> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = and <64 x i1> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and <64 x i1> [[TMP45]], [[TMP44]]
+; CHECK-NEXT:    [[TMP49:%.*]] = and <64 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = or <64 x i1> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or <64 x i1> [[TMP50]], [[TMP49]]
+; CHECK-NEXT:    [[TMP52:%.*]] = sext <64 x i1> [[TMP51]] to <64 x i8>
+; CHECK-NEXT:    [[TMP53:%.*]] = bitcast <64 x i8> [[TMP52]] to <32 x i16>
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp ne <32 x i16> [[TMP53]], zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = sext <32 x i1> [[TMP54]] to <32 x i16>
+; CHECK-NEXT:    [[TMP56:%.*]] = bitcast <32 x i16> [[TMP55]] to i512
+; CHECK-NEXT:    [[TMP57:%.*]] = bitcast i512 [[TMP56]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[TMP57]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -174,12 +278,28 @@ declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <
 define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpwssd_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <32 x i16> [[TMP17]] to <16 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
@@ -192,8 +312,8 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -208,8 +328,24 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16>
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <32 x i16> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP54:%.*]] = and <32 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP55:%.*]] = and <32 x i1> [[TMP35]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and <32 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP57:%.*]] = or <32 x i1> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP58:%.*]] = or <32 x i1> [[TMP57]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sext <32 x i1> [[TMP58]] to <32 x i16>
+; CHECK-NEXT:    [[TMP60:%.*]] = bitcast <32 x i16> [[TMP59]] to <16 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -219,8 +355,24 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16>
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <32 x i16> [[TMP38]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = and <32 x i1> [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and <32 x i1> [[TMP43]], [[TMP42]]
+; CHECK-NEXT:    [[TMP47:%.*]] = and <32 x i1> [[TMP41]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = or <32 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP49:%.*]] = or <32 x i1> [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP50:%.*]] = sext <32 x i1> [[TMP49]] to <32 x i16>
+; CHECK-NEXT:    [[TMP51:%.*]] = bitcast <32 x i16> [[TMP50]] to <16 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer
+; CHECK-NEXT:    [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -251,12 +403,28 @@ declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>,
 define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpwssds_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <32 x i16> [[TMP17]] to <16 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
@@ -269,8 +437,8 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -285,8 +453,24 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16>
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <32 x i16> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP54:%.*]] = and <32 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP55:%.*]] = and <32 x i1> [[TMP35]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and <32 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP57:%.*]] = or <32 x i1> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP58:%.*]] = or <32 x i1> [[TMP57]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sext <32 x i1> [[TMP58]] to <32 x i16>
+; CHECK-NEXT:    [[TMP60:%.*]] = bitcast <32 x i16> [[TMP59]] to <16 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -296,8 +480,24 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16>
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <32 x i16> [[TMP38]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = and <32 x i1> [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and <32 x i1> [[TMP43]], [[TMP42]]
+; CHECK-NEXT:    [[TMP47:%.*]] = and <32 x i1> [[TMP41]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = or <32 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP49:%.*]] = or <32 x i1> [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP50:%.*]] = sext <32 x i1> [[TMP49]] to <32 x i16>
+; CHECK-NEXT:    [[TMP51:%.*]] = bitcast <32 x i16> [[TMP50]] to <16 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer
+; CHECK-NEXT:    [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll
index ca07d5905c8af..7c39ff6bb2be1 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll
@@ -5,11 +5,7 @@
 ;
 ; Handled strictly: (none)
 ;
-; Handled heuristically:
-; - llvm.x86.avx512.vpdpbusd.512
-; - llvm.x86.avx512.vpdpbusds.512
-; - llvm.x86.avx512.vpdpwssd.512
-; - llvm.x86.avx512.vpdpwssds.512
+; Handled heuristically: (none)
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -19,12 +15,30 @@ declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i
 define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <64 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <64 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <64 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <64 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <64 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <64 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <64 x i1> [[TMP16]] to <64 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <64 x i8> [[TMP17]] to <32 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <32 x i1> [[TMP19]] to <32 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <32 x i16> [[TMP20]] to i512
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i512 [[TMP21]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
@@ -37,8 +51,8 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
 ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -53,8 +67,26 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <64 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <64 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <64 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = and <64 x i1> [[TMP34]], [[TMP35]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and <64 x i1> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP59:%.*]] = and <64 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT:    [[TMP60:%.*]] = or <64 x i1> [[TMP38]], [[TMP58]]
+; CHECK-NEXT:    [[TMP61:%.*]] = or <64 x i1> [[TMP60]], [[TMP59]]
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <64 x i1> [[TMP61]] to <64 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = bitcast <64 x i8> [[TMP62]] to <32 x i16>
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp ne <32 x i16> [[TMP63]], zeroinitializer
+; CHECK-NEXT:    [[TMP65:%.*]] = sext <32 x i1> [[TMP64]] to <32 x i16>
+; CHECK-NEXT:    [[TMP66:%.*]] = bitcast <32 x i16> [[TMP65]] to i512
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i512 [[TMP66]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP29]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -64,8 +96,26 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i32> [[TMP5]] to <64 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <64 x i8> [[TMP41]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <64 x i8> [[TMP42]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp ne <64 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <64 x i8> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = and <64 x i1> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and <64 x i1> [[TMP45]], [[TMP44]]
+; CHECK-NEXT:    [[TMP49:%.*]] = and <64 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = or <64 x i1> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or <64 x i1> [[TMP50]], [[TMP49]]
+; CHECK-NEXT:    [[TMP52:%.*]] = sext <64 x i1> [[TMP51]] to <64 x i8>
+; CHECK-NEXT:    [[TMP53:%.*]] = bitcast <64 x i8> [[TMP52]] to <32 x i16>
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp ne <32 x i16> [[TMP53]], zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = sext <32 x i1> [[TMP54]] to <32 x i16>
+; CHECK-NEXT:    [[TMP56:%.*]] = bitcast <32 x i16> [[TMP55]] to i512
+; CHECK-NEXT:    [[TMP57:%.*]] = bitcast i512 [[TMP56]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[TMP57]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -99,12 +149,30 @@ declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x
 define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpbusds_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <64 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <64 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <64 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <64 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <64 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <64 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <64 x i1> [[TMP16]] to <64 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <64 x i8> [[TMP17]] to <32 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <32 x i1> [[TMP19]] to <32 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <32 x i16> [[TMP20]] to i512
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i512 [[TMP21]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
@@ -117,8 +185,8 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
 ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -133,8 +201,26 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <64 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <64 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <64 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = and <64 x i1> [[TMP34]], [[TMP35]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and <64 x i1> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP59:%.*]] = and <64 x i1> [[TMP34]], [[TMP37]]
+; CHECK-NEXT:    [[TMP60:%.*]] = or <64 x i1> [[TMP38]], [[TMP58]]
+; CHECK-NEXT:    [[TMP61:%.*]] = or <64 x i1> [[TMP60]], [[TMP59]]
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <64 x i1> [[TMP61]] to <64 x i8>
+; CHECK-NEXT:    [[TMP63:%.*]] = bitcast <64 x i8> [[TMP62]] to <32 x i16>
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp ne <32 x i16> [[TMP63]], zeroinitializer
+; CHECK-NEXT:    [[TMP65:%.*]] = sext <32 x i1> [[TMP64]] to <32 x i16>
+; CHECK-NEXT:    [[TMP66:%.*]] = bitcast <32 x i16> [[TMP65]] to i512
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i512 [[TMP66]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP29]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -144,8 +230,26 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8>
+; CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i32> [[TMP5]] to <64 x i8>
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <64 x i8> [[TMP41]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <64 x i8> [[TMP42]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp ne <64 x i8> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <64 x i8> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = and <64 x i1> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and <64 x i1> [[TMP45]], [[TMP44]]
+; CHECK-NEXT:    [[TMP49:%.*]] = and <64 x i1> [[TMP43]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = or <64 x i1> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or <64 x i1> [[TMP50]], [[TMP49]]
+; CHECK-NEXT:    [[TMP52:%.*]] = sext <64 x i1> [[TMP51]] to <64 x i8>
+; CHECK-NEXT:    [[TMP53:%.*]] = bitcast <64 x i8> [[TMP52]] to <32 x i16>
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp ne <32 x i16> [[TMP53]], zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = sext <32 x i1> [[TMP54]] to <32 x i16>
+; CHECK-NEXT:    [[TMP56:%.*]] = bitcast <32 x i16> [[TMP55]] to i512
+; CHECK-NEXT:    [[TMP57:%.*]] = bitcast i512 [[TMP56]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[TMP57]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -179,12 +283,28 @@ declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i
 define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpwssd_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <32 x i16> [[TMP17]] to <16 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
@@ -197,8 +317,8 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -213,8 +333,24 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16>
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <32 x i16> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP54:%.*]] = and <32 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP55:%.*]] = and <32 x i1> [[TMP35]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and <32 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP57:%.*]] = or <32 x i1> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP58:%.*]] = or <32 x i1> [[TMP57]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sext <32 x i1> [[TMP58]] to <32 x i16>
+; CHECK-NEXT:    [[TMP60:%.*]] = bitcast <32 x i16> [[TMP59]] to <16 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -224,8 +360,24 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16>
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <32 x i16> [[TMP38]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = and <32 x i1> [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and <32 x i1> [[TMP43]], [[TMP42]]
+; CHECK-NEXT:    [[TMP47:%.*]] = and <32 x i1> [[TMP41]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = or <32 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP49:%.*]] = or <32 x i1> [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP50:%.*]] = sext <32 x i1> [[TMP49]] to <32 x i16>
+; CHECK-NEXT:    [[TMP51:%.*]] = bitcast <32 x i16> [[TMP50]] to <16 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer
+; CHECK-NEXT:    [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -259,12 +411,28 @@ declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x
 define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_ask_vpdpwssds_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <32 x i16> [[TMP17]] to <16 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
@@ -277,8 +445,8 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -293,8 +461,24 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16>
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <32 x i16> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP54:%.*]] = and <32 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP55:%.*]] = and <32 x i1> [[TMP35]], [[TMP34]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and <32 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP57:%.*]] = or <32 x i1> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP58:%.*]] = or <32 x i1> [[TMP57]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sext <32 x i1> [[TMP58]] to <32 x i16>
+; CHECK-NEXT:    [[TMP60:%.*]] = bitcast <32 x i16> [[TMP59]] to <16 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
@@ -304,8 +488,24 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
+; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16>
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <32 x i16> [[TMP38]], zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = and <32 x i1> [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and <32 x i1> [[TMP43]], [[TMP42]]
+; CHECK-NEXT:    [[TMP47:%.*]] = and <32 x i1> [[TMP41]], [[TMP44]]
+; CHECK-NEXT:    [[TMP48:%.*]] = or <32 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP49:%.*]] = or <32 x i1> [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP50:%.*]] = sext <32 x i1> [[TMP49]] to <32 x i16>
+; CHECK-NEXT:    [[TMP51:%.*]] = bitcast <32 x i16> [[TMP50]] to <16 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer
+; CHECK-NEXT:    [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll
index 0af0a89f177ee..678faef203324 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll
@@ -5,15 +5,7 @@
 ;
 ; Handled strictly: (none)
 ;
-; Handled heuristically:
-; - llvm.x86.avx512.vpdpbusd.128
-; - llvm.x86.avx512.vpdpbusd.256
-; - llvm.x86.avx512.vpdpbusds.128
-; - llvm.x86.avx512.vpdpbusds.256
-; - llvm.x86.avx512.vpdpwssd.128
-; - llvm.x86.avx512.vpdpwssd.256
-; - llvm.x86.avx512.vpdpwssds.128
-; - llvm.x86.avx512.vpdpwssds.256
+; Handled heuristically: (none)
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -23,12 +15,30 @@ declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
 define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpbusd_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <32 x i8> [[TMP17]] to <16 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
@@ -42,12 +52,30 @@ declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
 define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpbusd_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i8> [[TMP17]] to <8 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
@@ -61,12 +89,30 @@ declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>
 define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpbusds_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <32 x i8> [[TMP17]] to <16 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
@@ -80,12 +126,30 @@ declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>
 define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpbusds_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i8> [[TMP17]] to <8 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
@@ -99,12 +163,28 @@ declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
 define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpwssd_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i16> [[TMP17]] to <8 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
@@ -118,12 +198,28 @@ declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
 define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpwssd_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <8 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <8 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <8 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <8 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x i16> [[TMP17]] to <4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
@@ -137,12 +233,28 @@ declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>
 define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpwssds_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <16 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i16> [[TMP17]] to <8 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
@@ -156,12 +268,28 @@ declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>
 define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpwssds_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <8 x i16> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <8 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <8 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <8 x i1> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x i16> [[TMP17]] to <4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll
index d586c314ed28c..b36d09bfb5944 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll
@@ -5,19 +5,7 @@
 ;
 ; Handled strictly: (none)
 ;
-; Handled heuristically:
-; - llvm.x86.avx2.vpdpbssd.128
-; - llvm.x86.avx2.vpdpbssd.256
-; - llvm.x86.avx2.vpdpbssds.128
-; - llvm.x86.avx2.vpdpbssds.256
-; - llvm.x86.avx2.vpdpbsud.128
-; - llvm.x86.avx2.vpdpbsud.256
-; - llvm.x86.avx2.vpdpbsuds.128
-; - llvm.x86.avx2.vpdpbsuds.256
-; - llvm.x86.avx2.vpdpbuud.128
-; - llvm.x86.avx2.vpdpbuud.256
-; - llvm.x86.avx2.vpdpbuuds.128
-; - llvm.x86.avx2.vpdpbuuds.256
+; Handled heuristically: (none)
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -28,8 +16,8 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, pt
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbssd_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -43,11 +31,47 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, pt
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <16 x i8> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <16 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <16 x i8> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP19:%.*]] = and <16 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or <16 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <16 x i1> [[TMP21]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <16 x i1> [[TMP22]] to <16 x i8>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <16 x i8> [[TMP23]] to <8 x i16>
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <8 x i16> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP26:%.*]] = sext <8 x i1> [[TMP25]] to <8 x i16>
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <8 x i16> [[TMP26]] to i128
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i128 [[TMP27]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP28]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP34]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP39:%.*]] = and <16 x i1> [[TMP35]], [[TMP36]]
+; CHECK-NEXT:    [[TMP40:%.*]] = and <16 x i1> [[TMP37]], [[TMP36]]
+; CHECK-NEXT:    [[TMP41:%.*]] = and <16 x i1> [[TMP35]], [[TMP38]]
+; CHECK-NEXT:    [[TMP42:%.*]] = or <16 x i1> [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    [[TMP43:%.*]] = or <16 x i1> [[TMP42]], [[TMP41]]
+; CHECK-NEXT:    [[TMP44:%.*]] = sext <16 x i1> [[TMP43]] to <16 x i8>
+; CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP44]] to <8 x i16>
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <8 x i16> [[TMP45]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = sext <8 x i1> [[TMP46]] to <8 x i16>
+; CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i16> [[TMP47]] to i128
+; CHECK-NEXT:    [[TMP49:%.*]] = bitcast i128 [[TMP48]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP49]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
 ; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]]
 ; CHECK-NEXT:    [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]]
@@ -67,8 +91,8 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, p
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbssds_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -82,11 +106,47 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, p
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <16 x i8> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <16 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <16 x i8> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP19:%.*]] = and <16 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or <16 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <16 x i1> [[TMP21]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <16 x i1> [[TMP22]] to <16 x i8>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <16 x i8> [[TMP23]] to <8 x i16>
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <8 x i16> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP26:%.*]] = sext <8 x i1> [[TMP25]] to <8 x i16>
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <8 x i16> [[TMP26]] to i128
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i128 [[TMP27]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP28]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP34]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP39:%.*]] = and <16 x i1> [[TMP35]], [[TMP36]]
+; CHECK-NEXT:    [[TMP40:%.*]] = and <16 x i1> [[TMP37]], [[TMP36]]
+; CHECK-NEXT:    [[TMP41:%.*]] = and <16 x i1> [[TMP35]], [[TMP38]]
+; CHECK-NEXT:    [[TMP42:%.*]] = or <16 x i1> [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    [[TMP43:%.*]] = or <16 x i1> [[TMP42]], [[TMP41]]
+; CHECK-NEXT:    [[TMP44:%.*]] = sext <16 x i1> [[TMP43]] to <16 x i8>
+; CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP44]] to <8 x i16>
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <8 x i16> [[TMP45]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = sext <8 x i1> [[TMP46]] to <8 x i16>
+; CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i16> [[TMP47]] to i128
+; CHECK-NEXT:    [[TMP49:%.*]] = bitcast i128 [[TMP48]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP49]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
 ; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]]
 ; CHECK-NEXT:    [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]]
@@ -106,8 +166,8 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, pt
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbssd_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -121,11 +181,47 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, pt
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <32 x i8> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <32 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <32 x i8> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = and <32 x i1> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP19:%.*]] = and <32 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP20:%.*]] = and <32 x i1> [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or <32 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <32 x i1> [[TMP21]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <32 x i1> [[TMP22]] to <32 x i8>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <32 x i8> [[TMP23]] to <16 x i16>
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <16 x i16> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP26:%.*]] = sext <16 x i1> [[TMP25]] to <16 x i16>
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <16 x i16> [[TMP26]] to i256
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i256 [[TMP27]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP28]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i32> [[TMP4]] to <32 x i8>
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP34]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP39:%.*]] = and <32 x i1> [[TMP35]], [[TMP36]]
+; CHECK-NEXT:    [[TMP40:%.*]] = and <32 x i1> [[TMP37]], [[TMP36]]
+; CHECK-NEXT:    [[TMP41:%.*]] = and <32 x i1> [[TMP35]], [[TMP38]]
+; CHECK-NEXT:    [[TMP42:%.*]] = or <32 x i1> [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    [[TMP43:%.*]] = or <32 x i1> [[TMP42]], [[TMP41]]
+; CHECK-NEXT:    [[TMP44:%.*]] = sext <32 x i1> [[TMP43]] to <32 x i8>
+; CHECK-NEXT:    [[TMP45:%.*]] = bitcast <32 x i8> [[TMP44]] to <16 x i16>
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <16 x i16> [[TMP45]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = sext <16 x i1> [[TMP46]] to <16 x i16>
+; CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i16> [[TMP47]] to i256
+; CHECK-NEXT:    [[TMP49:%.*]] = bitcast i256 [[TMP48]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP49]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
 ; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]]
 ; CHECK-NEXT:    [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]]
@@ -145,8 +241,8 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, p
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbssds_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -160,11 +256,47 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, p
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <32 x i8> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <32 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <32 x i8> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = and <32 x i1> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP19:%.*]] = and <32 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP20:%.*]] = and <32 x i1> [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or <32 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <32 x i1> [[TMP21]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <32 x i1> [[TMP22]] to <32 x i8>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <32 x i8> [[TMP23]] to <16 x i16>
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <16 x i16> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP26:%.*]] = sext <16 x i1> [[TMP25]] to <16 x i16>
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <16 x i16> [[TMP26]] to i256
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i256 [[TMP27]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP28]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i32> [[TMP4]] to <32 x i8>
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP34]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer
+; CHECK-NEXT:    [[TMP39:%.*]] = and <32 x i1> [[TMP35]], [[TMP36]]
+; CHECK-NEXT:    [[TMP40:%.*]] = and <32 x i1> [[TMP37]], [[TMP36]]
+; CHECK-NEXT:    [[TMP41:%.*]] = and <32 x i1> [[TMP35]], [[TMP38]]
+; CHECK-NEXT:    [[TMP42:%.*]] = or <32 x i1> [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    [[TMP43:%.*]] = or <32 x i1> [[TMP42]], [[TMP41]]
+; CHECK-NEXT:    [[TMP44:%.*]] = sext <32 x i1> [[TMP43]] to <32 x i8>
+; CHECK-NEXT:    [[TMP45:%.*]] = bitcast <32 x i8> [[TMP44]] to <16 x i16>
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <16 x i16> [[TMP45]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = sext <16 x i1> [[TMP46]] to <16 x i16>
+; CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i16> [[TMP47]] to i256
+; CHECK-NEXT:    [[TMP49:%.*]] = bitcast i256 [[TMP48]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP49]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
 ; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]]
 ; CHECK-NEXT:    [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]]