[compiler-rt] [llvm] [msan] Reland with even more improvement: Improve packed multiply-add instrumentation (PR #153353)

Thurston Dang via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 13 13:21:18 PDT 2025


https://github.com/thurstond updated https://github.com/llvm/llvm-project/pull/153353

>From 3317cb5d590b17c69bd4739cb801da431d5724e4 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 13 Aug 2025 04:54:57 +0000
Subject: [PATCH 01/10] Reapply "[msan] Improve packed multiply-add
 instrumentation" (#153343)

This reverts commit cf002847a464c004a57ca4777251b1aafc33d958 i.e.,
relands ba603b5e4d44f1a25207a2a00196471d2ba93424. It was reverted
because it was subtly wrong: multiplying an uninitialized zero should not
result in an initialized zero. This reland fixes the issue by using
instrumentation analogous to visitAnd (bitwise AND of an initialized
zero and an uninitialized value results in an initialized zero), and
expands a test case.

Original commit message:
The current instrumentation has false positives: if there is a single uninitialized bit in any of the operands, the entire output is poisoned. This does not take into account that multiplying an uninitialized value with zero results in an initialized zero value.

This step allows elements that are zero to clear the corresponding shadow during the multiplication step. The horizontal add step and accumulation step (if any) are modeled using bitwise OR.

Future work can apply this improved handler to the AVX512 equivalent intrinsics (x86_avx512_pmaddw_d_512, x86_avx512_pmaddubs_w_512) and AVX VNNI intrinsics.
---
 .../Instrumentation/MemorySanitizer.cpp       | 106 ++++++++++++++----
 .../X86/avx2-intrinsics-x86.ll                |  42 +++++--
 .../MemorySanitizer/X86/mmx-intrinsics.ll     |  60 ++++++----
 .../X86/sse2-intrinsics-x86.ll                |  14 ++-
 .../i386/avx2-intrinsics-i386.ll              |  50 ++++++---
 .../MemorySanitizer/i386/mmx-intrinsics.ll    |  60 ++++++----
 .../i386/sse2-intrinsics-i386.ll              |  16 ++-
 .../MemorySanitizer/vector_arith.ll           |  35 ++++--
 8 files changed, 280 insertions(+), 103 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 21bd4164385ab..4cef57d43f203 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3641,9 +3641,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
-  // Get an MMX-sized vector type.
-  Type *getMMXVectorTy(unsigned EltSizeInBits) {
-    const unsigned X86_MMXSizeInBits = 64;
+  // Get an MMX-sized (64-bit) vector type, or optionally, other sized
+  // vectors.
+  Type *getMMXVectorTy(unsigned EltSizeInBits,
+                       unsigned X86_MMXSizeInBits = 64) {
     assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
            "Illegal MMX vector element size");
     return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
@@ -3843,20 +3844,78 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
-  // Instrument multiply-add intrinsic.
-  void handleVectorPmaddIntrinsic(IntrinsicInst &I,
-                                  unsigned MMXEltSizeInBits = 0) {
-    Type *ResTy =
-        MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType();
+  // Instrument multiply-add intrinsics.
+  //
+  // e.g., Two operands:
+  //         <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
+  //         <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
+  //
+  //       Three operands are not implemented yet:
+  //         <4 x i32> @llvm.x86.avx512.vpdpbusd.128
+  //                       (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
+  //         (the result of multiply-add'ing %a and %b is accumulated with %s)
+  void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
+                                  unsigned EltSizeInBits = 0) {
     IRBuilder<> IRB(&I);
-    auto *Shadow0 = getShadow(&I, 0);
-    auto *Shadow1 = getShadow(&I, 1);
-    Value *S = IRB.CreateOr(Shadow0, Shadow1);
-    S = IRB.CreateBitCast(S, ResTy);
-    S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
-                       ResTy);
-    S = IRB.CreateBitCast(S, getShadowTy(&I));
-    setShadow(&I, S);
+
+    [[maybe_unused]] FixedVectorType *ReturnType =
+        cast<FixedVectorType>(I.getType());
+    assert(isa<FixedVectorType>(ReturnType));
+
+    assert(I.arg_size() == 2);
+
+    // Vectors A and B, and shadows
+    Value *Va = I.getOperand(0);
+    Value *Vb = I.getOperand(1);
+
+    Value *Sa = getShadow(&I, 0);
+    Value *Sb = getShadow(&I, 1);
+
+    FixedVectorType *ParamType =
+        cast<FixedVectorType>(I.getArgOperand(0)->getType());
+    assert(ParamType == I.getArgOperand(1)->getType());
+
+    assert(ParamType->getPrimitiveSizeInBits() ==
+           ReturnType->getPrimitiveSizeInBits());
+
+    // Step 1: instrument multiplication of corresponding vector elements
+    if (EltSizeInBits) {
+      ParamType = cast<FixedVectorType>(
+          getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
+
+      Va = IRB.CreateBitCast(Va, ParamType);
+      Vb = IRB.CreateBitCast(Vb, ParamType);
+
+      Sa = IRB.CreateBitCast(Sa, getShadowTy(ParamType));
+      Sb = IRB.CreateBitCast(Sb, getShadowTy(ParamType));
+    } else {
+      assert(ParamType->getNumElements() ==
+             ReturnType->getNumElements() * ReductionFactor);
+    }
+
+    Value *Sab = IRB.CreateOr(Sa, Sb);
+
+    // Multiplying an uninitialized element by zero results in an initialized
+    // element.
+    Value *Zero = Constant::getNullValue(Va->getType());
+    Value *VaNotZero = IRB.CreateICmpNE(Va, Zero);
+    Value *VbNotZero = IRB.CreateICmpNE(Vb, Zero);
+    Value *VaAndVbNotZero = IRB.CreateAnd(VaNotZero, VbNotZero);
+
+    // After multiplying e.g., <8 x i16> %a, <8 x i16> %b, we should have
+    // <8 x i32> %ab, but we cheated and ended up with <8 x i16>.
+    Sab = IRB.CreateAnd(Sab, IRB.CreateSExt(VaAndVbNotZero, Sab->getType()));
+
+    // Step 2: instrument horizontal add
+    // e.g., collapse <8 x i16> into <4 x i16> (reduction factor == 2)
+    //                <16 x i8> into <4 x i8>  (reduction factor == 4)
+    Value *OutShadow = horizontalReduce(I, ReductionFactor, Sab, nullptr);
+
+    // Extend to <4 x i32>.
+    // For MMX, cast it back to <1 x i64>.
+    OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
+
+    setShadow(&I, OutShadow);
     setOriginForNaryOp(I);
   }
 
@@ -5391,19 +5450,28 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       handleVectorSadIntrinsic(I);
       break;
 
+    // Multiply and Add Packed Words
+    //   < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
+
+    // Multiply and Add Packed Signed and Unsigned Bytes
+    //   < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
+    //   <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
     case Intrinsic::x86_sse2_pmadd_wd:
     case Intrinsic::x86_avx2_pmadd_wd:
     case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
     case Intrinsic::x86_avx2_pmadd_ub_sw:
-      handleVectorPmaddIntrinsic(I);
+      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
       break;
 
+    // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
     case Intrinsic::x86_ssse3_pmadd_ub_sw:
-      handleVectorPmaddIntrinsic(I, 8);
+      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8);
       break;
 
+    // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
     case Intrinsic::x86_mmx_pmadd_wd:
-      handleVectorPmaddIntrinsic(I, 16);
+      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
       break;
 
     case Intrinsic::x86_sse_cmp_ss:
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
index f916130fe53e5..ab05e10ccd3c9 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
@@ -141,10 +141,16 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32>
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <16 x i16> [[TMP3]], [[TMP12]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP10:%.*]] = or <8 x i16> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <8 x i16> [[TMP10]] to <8 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
@@ -678,10 +684,16 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = and <32 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <32 x i8> [[TMP3]], [[TMP12]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[TMP10:%.*]] = or <16 x i8> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i16>
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
 ;
@@ -707,10 +719,16 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i16>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i8> [[TMP8]], [[TMP17]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i8> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP15]] to <16 x i16>
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
 ;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index ac3bb56719038..a0341c67b1365 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -1687,16 +1687,27 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
-; CHECK-NEXT:    [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[MMX_VAR_I]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = or <4 x i16> [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP31:%.*]] = and <4 x i1> [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    [[TMP32:%.*]] = sext <4 x i1> [[TMP31]] to <4 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = and <4 x i16> [[TMP22]], [[TMP32]]
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP26:%.*]] = or <2 x i16> [[TMP24]], [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <2 x i16> [[TMP26]] to i32
+; CHECK-NEXT:    [[TMP28:%.*]] = zext i32 [[TMP27]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP28]] to <1 x i64>
+; CHECK-NEXT:    [[TMP33:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[TMP34:%.*]] = bitcast <1 x i64> [[TMP33]] to <2 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP34]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3315,16 +3326,27 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
-; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP22]] to <8 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i8> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = and <8 x i1> [[TMP32]], [[TMP33]]
+; CHECK-NEXT:    [[TMP35:%.*]] = sext <8 x i1> [[TMP34]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = and <8 x i8> [[TMP14]], [[TMP35]]
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP27:%.*]] = or <4 x i8> [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i8> [[TMP27]] to i32
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP29]] to i64
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast i64 [[TMP24]] to <1 x i64>
+; CHECK-NEXT:    [[TMP36:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[TMP30]] to <8 x i8>
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <1 x i64> [[TMP36]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP28]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
index 8f915a59db8e5..fe1245553c116 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
@@ -763,10 +763,16 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <8 x i1> [[TMP11]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <8 x i16> [[TMP3]], [[TMP12]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i16> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP10]] to <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0]], <8 x i16> [[A1]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
index 5cc56baf0e0de..bf87027b056fa 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
@@ -149,11 +149,17 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32>
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP5]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = and <16 x i16> [[TMP4]], [[TMP14]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i16> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <8 x i16> [[TMP11]] to <8 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
@@ -714,11 +720,17 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP5]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i8>
+; CHECK-NEXT:    [[TMP8:%.*]] = and <32 x i8> [[TMP4]], [[TMP14]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i8> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[TMP11]] to <16 x i16>
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
 ;
@@ -734,7 +746,7 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK:       4:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
@@ -744,11 +756,17 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT:    [[TMP6:%.*]] = and i64 [[TMP5]], -2147483649
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
-; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i16>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP9:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = sext <32 x i1> [[TMP12]] to <32 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i8> [[TMP9]], [[TMP18]]
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i8> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i16>
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
 ;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
index 0a3efaaea149f..e0e75a91cbd2e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
@@ -1730,16 +1730,27 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
-; CHECK-NEXT:    [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[MMX_VAR_I]] to <4 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = or <4 x i16> [[TMP11]], [[TMP22]]
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP32:%.*]] = and <4 x i1> [[TMP30]], [[TMP31]]
+; CHECK-NEXT:    [[TMP33:%.*]] = sext <4 x i1> [[TMP32]] to <4 x i16>
+; CHECK-NEXT:    [[TMP24:%.*]] = and <4 x i16> [[TMP23]], [[TMP33]]
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP27:%.*]] = or <2 x i16> [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <2 x i16> [[TMP27]] to i32
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP29]] to <1 x i64>
+; CHECK-NEXT:    [[TMP34:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[TMP35:%.*]] = bitcast <1 x i64> [[TMP34]] to <2 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP35]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3401,16 +3412,27 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
-; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP22]] to <8 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP26:%.*]] = or <8 x i8> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <8 x i8> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = and <8 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = sext <8 x i1> [[TMP35]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = and <8 x i8> [[TMP26]], [[TMP36]]
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP29:%.*]] = or <4 x i8> [[TMP27]], [[TMP28]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x i8> [[TMP29]] to i32
+; CHECK-NEXT:    [[TMP30:%.*]] = zext i32 [[TMP24]] to i64
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast i64 [[TMP30]] to <1 x i64>
+; CHECK-NEXT:    [[TMP31:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[TMP32]] to <8 x i8>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[TMP31]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
index e771e60e2f294..5edc13b7abbc4 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
@@ -800,11 +800,17 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <8 x i1> [[TMP5]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = and <8 x i16> [[TMP4]], [[TMP14]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i16> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0]], <8 x i16> [[A1]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index d614bb85d8584..21996b13a9961 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -18,9 +18,15 @@ define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i16> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i16> [[A]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <8 x i1> [[TMP10]] to <8 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i16> [[TMP2]], [[TMP11]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i16> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i16> [[TMP9]] to <4 x i32>
 ; CHECK-NEXT:    [[C:%.*]] = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A]], <8 x i16> [[B]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[C]]
@@ -39,13 +45,24 @@ define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_me
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP2:%.*]] = or <1 x i64> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i16> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i8> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = and <8 x i1> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <8 x i8> [[TMP13]], [[TMP17]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i8> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x i8> [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i64 [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[C:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[A]], <1 x i64> [[B]]) #[[ATTR2]]
-; CHECK-NEXT:    store <1 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <1 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <1 x i64> [[C]]
 ;
 entry:

>From 41b09e976d337d997ea4fe488a8e385c237b2aa1 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 13 Aug 2025 06:13:42 +0000
Subject: [PATCH 02/10] Change instrumentation to visitAnd-like

---
 .../Instrumentation/MemorySanitizer.cpp       | 67 +++++++++++++++----
 1 file changed, 53 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 4cef57d43f203..d3e686c26f188 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3893,23 +3893,44 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
              ReturnType->getNumElements() * ReductionFactor);
     }
 
-    Value *Sab = IRB.CreateOr(Sa, Sb);
-
-    // Multiplying an uninitialized / element by zero results in an initialized
-    // element.
-    Value *Zero = Constant::getNullValue(Va->getType());
-    Value *VaNotZero = IRB.CreateICmpNE(Va, Zero);
-    Value *VbNotZero = IRB.CreateICmpNE(Vb, Zero);
-    Value *VaAndVbNotZero = IRB.CreateAnd(VaNotZero, VbNotZero);
-
-    // After multiplying e.g., <8 x i16> %a, <8 x i16> %b, we should have
-    // <8 x i32> %ab, but we cheated and ended up with <8 x i16>.
-    Sab = IRB.CreateAnd(Sab, IRB.CreateSExt(VaAndVbNotZero, Sab->getType()));
+    // Multiplying an *initialized* zero by an uninitialized element results in
+    // an initialized zero element.
+    //
+    // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value
+    // results in an unpoisoned value. We can therefore adapt the visitAnd()
+    // instrumentation:
+    //   OutShadow =   (SaNonZero & SbNonZero)
+    //               | (VaNonZero & SbNonZero)
+    //               | (SaNonZero & VbNonZero)
+    //   where non-zero is checked on a per-element basis.
+    Value *SZero = Constant::getNullValue(Va->getType());
+    Value *VZero = Constant::getNullValue(Sa->getType());
+    Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero);
+    Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero);
+    Value *VaNonZero = IRB.CreateICmpNE(Va, VZero);
+    Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero);
+
+    Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero);
+    Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero);
+    Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero);
+
+    // Each element of the vector is represented by a single bit (poisoned or
+    // not) e.g., <8 x i1>.
+    Value *ComboNonZero =
+        IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero});
+
+    // Extend <8 x i1> to <8 x i16>.
+    // (The real pmadd intrinsic would have computed intermediate values of
+    // <8 x i32>, but that is irrelevant for our shadow purposes because we
+    // consider each element to be either fully initialized or fully
+    // uninitialized.)
+    ComboNonZero = IRB.CreateSExt(ComboNonZero, Sa->getType());
 
     // Step 2: instrument horizontal add
     // e.g., collapse <8 x i16> into <4 x i16> (reduction factor == 2)
     //                <16 x i8> into <4 x i8>  (reduction factor == 4)
-    Value *OutShadow = horizontalReduce(I, ReductionFactor, Sab, nullptr);
+    Value *OutShadow =
+        horizontalReduce(I, ReductionFactor, ComboNonZero, nullptr);
 
     // Extend to <4 x i32>.
     // For MMX, cast it back to <1 x i64>.
@@ -5453,14 +5474,32 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     // Multiply and Add Packed Words
     //   < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
     //   < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
-
+    //   <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)
+    //
     // Multiply and Add Packed Signed and Unsigned Bytes
     //   < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
     //   <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
+    //   <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
+    //
+    // These intrinsics are auto-upgraded into non-masked forms:
+    //   < 4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128
+    //                  (<8 x i16>, <8 x i16>, <4 x i32>, i8)
+    //   < 8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256
+    //                  (<16 x i16>, <16 x i16>, <8 x i32>, i8)
+    //   <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512
+    //                  (<32 x i16>, <32 x i16>, <16 x i32>, i16)
+    //   < 8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128
+    //                  (<16 x i8>, <16 x i8>, <8 x i16>, i8)
+    //   <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256
+    //                  (<32 x i8>, <32 x i8>, <16 x i16>, i16)
+    //   <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512
+    //                  (<64 x i8>, <64 x i8>, <32 x i16>, i32)
     case Intrinsic::x86_sse2_pmadd_wd:
     case Intrinsic::x86_avx2_pmadd_wd:
+    case Intrinsic::x86_avx512_pmaddw_d_512:
     case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
     case Intrinsic::x86_avx2_pmadd_ub_sw:
+    case Intrinsic::x86_avx512_pmaddubs_w_512:
       handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
       break;
 

>From f32000fa2f94d2987af98b16a6e90b368e7b0851 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 13 Aug 2025 06:14:13 +0000
Subject: [PATCH 03/10] Expand test case

---
 compiler-rt/lib/msan/tests/msan_test.cpp | 27 +++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/msan/tests/msan_test.cpp b/compiler-rt/lib/msan/tests/msan_test.cpp
index d1c481483dfad..bcf048f04c5c2 100644
--- a/compiler-rt/lib/msan/tests/msan_test.cpp
+++ b/compiler-rt/lib/msan/tests/msan_test.cpp
@@ -4271,14 +4271,39 @@ TEST(VectorSadTest, sse2_psad_bw) {
 }
 
 TEST(VectorMaddTest, mmx_pmadd_wd) {
-  V4x16 a = {Poisoned<U2>(), 1, 2, 3};
+  V4x16 a = {Poisoned<U2>(0), 1, 2, 3};
   V4x16 b = {100, 101, 102, 103};
   V2x32 c = _mm_madd_pi16(a, b);
+  // Multiply step:
+  //    {Poison * 100, 1 * 101, 2 * 102, 3 * 103}
+  // == {Poison,       1 * 101, 2 * 102, 3 * 103}
+  //    Notice that for the poisoned value, we ignored the concrete zero value.
+  //
+  // Horizontal add step:
+  //    {Poison + 1 * 101, 2 * 102 + 3 * 103}
+  // == {Poison,           2 * 102 + 3 * 103}
 
   EXPECT_POISONED(c[0]);
   EXPECT_NOT_POISONED(c[1]);
 
   EXPECT_EQ((unsigned)(2 * 102 + 3 * 103), c[1]);
+
+  V4x16 d = {Poisoned<U2>(0), 1, 0, 3};
+  V4x16 e = {100, 101, Poisoned<U2>(102), 103};
+  V2x32 f = _mm_madd_pi16(d, e);
+  // Multiply step:
+  //    {Poison * 100, 1 * 101, 0 * Poison, 3 * 103}
+  // == {Poison,       1 * 101, 0         , 3 * 103}
+  //    Notice that 0 * Poison == 0.
+  //
+  // Horizontal add step:
+  //    {Poison + 1 * 101, 0 + 3 * 103}
+  // == {Poison,           3 * 103}
+
+  EXPECT_POISONED(f[0]);
+  EXPECT_NOT_POISONED(f[1]);
+
+  EXPECT_EQ((unsigned)(3 * 103), f[1]);
 }
 
 TEST(VectorCmpTest, mm_cmpneq_ps) {

>From 65f854de78d715c1d7293b2098c313f5129f0f24 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 13 Aug 2025 06:26:55 +0000
Subject: [PATCH 04/10] Update test output to match new visitAnd()-like
 instrumentation

---
 .../X86/avx2-intrinsics-x86.ll                |  42 ++++---
 .../X86/avx512bw-intrinsics-upgrade.ll        | 116 ++++++++++--------
 .../X86/avx512bw-intrinsics.ll                | 116 ++++++++++--------
 .../MemorySanitizer/X86/mmx-intrinsics.ll     |  28 +++--
 .../X86/sse2-intrinsics-x86.ll                |  14 ++-
 .../i386/avx2-intrinsics-i386.ll              |  42 ++++---
 .../MemorySanitizer/i386/mmx-intrinsics.ll    |  28 +++--
 .../i386/sse2-intrinsics-i386.ll              |  14 ++-
 .../MemorySanitizer/vector_arith.ll           |  28 +++--
 9 files changed, 258 insertions(+), 170 deletions(-)

diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
index ab05e10ccd3c9..26a6a3bdb5c0f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
@@ -140,12 +140,16 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i16>
-; CHECK-NEXT:    [[TMP7:%.*]] = and <16 x i16> [[TMP3]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP12]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <16 x i1> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP11]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <16 x i1> [[TMP17]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; CHECK-NEXT:    [[TMP10:%.*]] = or <8 x i16> [[TMP8]], [[TMP9]]
@@ -683,12 +687,16 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = and <32 x i1> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i8>
-; CHECK-NEXT:    [[TMP7:%.*]] = and <32 x i8> [[TMP3]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP12]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <32 x i1> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP11]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i8>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
 ; CHECK-NEXT:    [[TMP10:%.*]] = or <16 x i8> [[TMP8]], [[TMP9]]
@@ -718,12 +726,16 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
-; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP16:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
-; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i8> [[TMP8]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = and <32 x i1> [[TMP17]], [[TMP10]]
+; CHECK-NEXT:    [[TMP20:%.*]] = and <32 x i1> [[TMP9]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or <32 x i1> [[TMP16]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <32 x i1> [[TMP21]], [[TMP20]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <32 x i1> [[TMP22]] to <32 x i8>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i8> [[TMP13]], [[TMP14]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
index 02df9c49a010b..23a4b952281ae 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
@@ -4930,18 +4930,22 @@ define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <64 x i8> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
-; CHECK-NEXT:    store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = and <64 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = and <64 x i1> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <64 x i1> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <64 x i1> [[TMP17]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <64 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <64 x i1> [[TMP11]] to <64 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i8> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <32 x i8> [[TMP15]] to <32 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]])
+; CHECK-NEXT:    store <32 x i16> [[TMP16]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[TMP7]]
 ;
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1)
@@ -4955,22 +4959,26 @@ define <32 x i16> @test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <64 x i8> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = and <64 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP20:%.*]] = and <64 x i1> [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP21:%.*]] = and <64 x i1> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <64 x i1> [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = or <64 x i1> [[TMP22]], [[TMP21]]
+; CHECK-NEXT:    [[TMP24:%.*]] = sext <64 x i1> [[TMP23]] to <64 x i8>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <64 x i8> [[TMP24]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <64 x i8> [[TMP24]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+; CHECK-NEXT:    [[TMP17:%.*]] = or <32 x i8> [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP18:%.*]] = zext <32 x i8> [[TMP17]] to <32 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32 [[X3:%.*]] to <32 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP18]], <32 x i16> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <32 x i16> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <32 x i16> [[TMP13]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i16> [[TMP14]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP15]], <32 x i16> [[TMP12]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[X2]]
@@ -4988,18 +4996,22 @@ define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
-; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = and <32 x i1> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <32 x i1> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <32 x i1> [[TMP17]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <32 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i16>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i16> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <16 x i16> [[TMP15]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]])
+; CHECK-NEXT:    store <16 x i32> [[TMP16]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP7]]
 ;
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1)
@@ -5013,22 +5025,26 @@ define <16 x i32> @test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = and <32 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP20:%.*]] = and <32 x i1> [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP21:%.*]] = and <32 x i1> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <32 x i1> [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = or <32 x i1> [[TMP22]], [[TMP21]]
+; CHECK-NEXT:    [[TMP24:%.*]] = sext <32 x i1> [[TMP23]] to <32 x i16>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <32 x i16> [[TMP24]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <32 x i16> [[TMP24]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i16> [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP18:%.*]] = zext <16 x i16> [[TMP17]] to <16 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP18]], <16 x i32> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
index 78c272c7b2c5a..7b48829f28756 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
@@ -2204,18 +2204,22 @@ define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <64 x i8> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
-; CHECK-NEXT:    store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = and <64 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = and <64 x i1> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <64 x i1> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <64 x i1> [[TMP17]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <64 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <64 x i1> [[TMP11]] to <64 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i8> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <32 x i8> [[TMP15]] to <32 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]])
+; CHECK-NEXT:    store <32 x i16> [[TMP16]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[TMP7]]
 ;
   %1 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1)
@@ -2229,22 +2233,26 @@ define <32 x i16> @test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <64 x i8> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = and <64 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP20:%.*]] = and <64 x i1> [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP21:%.*]] = and <64 x i1> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <64 x i1> [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = or <64 x i1> [[TMP22]], [[TMP21]]
+; CHECK-NEXT:    [[TMP24:%.*]] = sext <64 x i1> [[TMP23]] to <64 x i8>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <64 x i8> [[TMP24]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <64 x i8> [[TMP24]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+; CHECK-NEXT:    [[TMP17:%.*]] = or <32 x i8> [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP18:%.*]] = zext <32 x i8> [[TMP17]] to <32 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32 [[X3:%.*]] to <32 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP18]], <32 x i16> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <32 x i16> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <32 x i16> [[TMP13]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i16> [[TMP14]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP15]], <32 x i16> [[TMP12]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[X2]]
@@ -2264,18 +2272,22 @@ define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
-; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = and <32 x i1> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <32 x i1> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <32 x i1> [[TMP17]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <32 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i16>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i16> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <16 x i16> [[TMP15]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]])
+; CHECK-NEXT:    store <16 x i32> [[TMP16]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP7]]
 ;
   %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1)
@@ -2289,22 +2301,26 @@ define <16 x i32> @test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = and <32 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP20:%.*]] = and <32 x i1> [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP21:%.*]] = and <32 x i1> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <32 x i1> [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = or <32 x i1> [[TMP22]], [[TMP21]]
+; CHECK-NEXT:    [[TMP24:%.*]] = sext <32 x i1> [[TMP23]] to <32 x i16>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <32 x i16> [[TMP24]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <32 x i16> [[TMP24]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i16> [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP18:%.*]] = zext <16 x i16> [[TMP17]] to <16 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP18]], <16 x i32> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index a0341c67b1365..d7221cc612135 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -1691,12 +1691,16 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
-; CHECK-NEXT:    [[TMP22:%.*]] = or <4 x i16> [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
-; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
 ; CHECK-NEXT:    [[TMP31:%.*]] = and <4 x i1> [[TMP29]], [[TMP30]]
-; CHECK-NEXT:    [[TMP32:%.*]] = sext <4 x i1> [[TMP31]] to <4 x i16>
-; CHECK-NEXT:    [[TMP23:%.*]] = and <4 x i16> [[TMP22]], [[TMP32]]
+; CHECK-NEXT:    [[TMP35:%.*]] = and <4 x i1> [[TMP22]], [[TMP30]]
+; CHECK-NEXT:    [[TMP36:%.*]] = and <4 x i1> [[TMP29]], [[TMP32]]
+; CHECK-NEXT:    [[TMP37:%.*]] = or <4 x i1> [[TMP31]], [[TMP35]]
+; CHECK-NEXT:    [[TMP38:%.*]] = or <4 x i1> [[TMP37]], [[TMP36]]
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <4 x i1> [[TMP38]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP26:%.*]] = or <2 x i16> [[TMP24]], [[TMP25]]
@@ -3330,12 +3334,16 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i8> [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    [[TMP32:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
-; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp ne <8 x i8> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = and <8 x i1> [[TMP32]], [[TMP33]]
-; CHECK-NEXT:    [[TMP35:%.*]] = sext <8 x i1> [[TMP34]] to <8 x i8>
-; CHECK-NEXT:    [[TMP16:%.*]] = and <8 x i8> [[TMP14]], [[TMP35]]
+; CHECK-NEXT:    [[TMP38:%.*]] = and <8 x i1> [[TMP35]], [[TMP33]]
+; CHECK-NEXT:    [[TMP39:%.*]] = and <8 x i1> [[TMP32]], [[TMP37]]
+; CHECK-NEXT:    [[TMP40:%.*]] = or <8 x i1> [[TMP34]], [[TMP38]]
+; CHECK-NEXT:    [[TMP41:%.*]] = or <8 x i1> [[TMP40]], [[TMP39]]
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <8 x i1> [[TMP41]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP27:%.*]] = or <4 x i8> [[TMP25]], [[TMP26]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
index fe1245553c116..eba94a5be92c6 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
@@ -762,12 +762,16 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sext <8 x i1> [[TMP11]] to <8 x i16>
-; CHECK-NEXT:    [[TMP7:%.*]] = and <8 x i16> [[TMP3]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <8 x i1> [[TMP12]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <8 x i1> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i1> [[TMP11]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <8 x i1> [[TMP17]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i16> [[TMP8]], [[TMP9]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
index bf87027b056fa..51f90c0cdf028 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
@@ -149,12 +149,16 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP5]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16>
-; CHECK-NEXT:    [[TMP8:%.*]] = and <16 x i16> [[TMP4]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = and <16 x i1> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = and <16 x i1> [[TMP5]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i1> [[TMP13]], [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or <16 x i1> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i16> [[TMP9]], [[TMP10]]
@@ -720,12 +724,16 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP5]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i8>
-; CHECK-NEXT:    [[TMP8:%.*]] = and <32 x i8> [[TMP4]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = and <32 x i1> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = and <32 x i1> [[TMP5]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <32 x i1> [[TMP13]], [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or <32 x i1> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sext <32 x i1> [[TMP19]] to <32 x i8>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
 ; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i8> [[TMP9]], [[TMP10]]
@@ -756,12 +764,16 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT:    [[TMP6:%.*]] = and i64 [[TMP5]], -2147483649
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
-; CHECK-NEXT:    [[TMP9:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP17]]
-; CHECK-NEXT:    [[TMP18:%.*]] = sext <32 x i1> [[TMP12]] to <32 x i8>
-; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i8> [[TMP9]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = and <32 x i1> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP21:%.*]] = and <32 x i1> [[TMP10]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <32 x i1> [[TMP12]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = or <32 x i1> [[TMP22]], [[TMP21]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <32 x i1> [[TMP23]] to <32 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
 ; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i8> [[TMP14]], [[TMP15]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
index e0e75a91cbd2e..1c94dce8cfd4b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
@@ -1734,12 +1734,16 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
-; CHECK-NEXT:    [[TMP23:%.*]] = or <4 x i16> [[TMP11]], [[TMP22]]
-; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne <4 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer
 ; CHECK-NEXT:    [[TMP32:%.*]] = and <4 x i1> [[TMP30]], [[TMP31]]
-; CHECK-NEXT:    [[TMP33:%.*]] = sext <4 x i1> [[TMP32]] to <4 x i16>
-; CHECK-NEXT:    [[TMP24:%.*]] = and <4 x i16> [[TMP23]], [[TMP33]]
+; CHECK-NEXT:    [[TMP36:%.*]] = and <4 x i1> [[TMP23]], [[TMP31]]
+; CHECK-NEXT:    [[TMP37:%.*]] = and <4 x i1> [[TMP30]], [[TMP33]]
+; CHECK-NEXT:    [[TMP38:%.*]] = or <4 x i1> [[TMP32]], [[TMP36]]
+; CHECK-NEXT:    [[TMP39:%.*]] = or <4 x i1> [[TMP38]], [[TMP37]]
+; CHECK-NEXT:    [[TMP24:%.*]] = sext <4 x i1> [[TMP39]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP27:%.*]] = or <2 x i16> [[TMP25]], [[TMP26]]
@@ -3416,12 +3420,16 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-; CHECK-NEXT:    [[TMP26:%.*]] = or <8 x i8> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <8 x i8> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <8 x i8> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <8 x i8> [[TMP12]], zeroinitializer
 ; CHECK-NEXT:    [[TMP35:%.*]] = and <8 x i1> [[TMP33]], [[TMP34]]
-; CHECK-NEXT:    [[TMP36:%.*]] = sext <8 x i1> [[TMP35]] to <8 x i8>
-; CHECK-NEXT:    [[TMP16:%.*]] = and <8 x i8> [[TMP26]], [[TMP36]]
+; CHECK-NEXT:    [[TMP38:%.*]] = and <8 x i1> [[TMP26]], [[TMP34]]
+; CHECK-NEXT:    [[TMP39:%.*]] = and <8 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP40:%.*]] = or <8 x i1> [[TMP35]], [[TMP38]]
+; CHECK-NEXT:    [[TMP41:%.*]] = or <8 x i1> [[TMP40]], [[TMP39]]
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <8 x i1> [[TMP41]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP29:%.*]] = or <4 x i8> [[TMP27]], [[TMP28]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
index 5edc13b7abbc4..1bf99f58f2ef5 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
@@ -800,12 +800,16 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = and <8 x i1> [[TMP5]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16>
-; CHECK-NEXT:    [[TMP8:%.*]] = and <8 x i16> [[TMP4]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = and <8 x i1> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = and <8 x i1> [[TMP5]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <8 x i1> [[TMP13]], [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or <8 x i1> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i16> [[TMP9]], [[TMP10]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index 21996b13a9961..f18ebccf96a28 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -17,14 +17,18 @@ define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i16> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i16> [[A]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i1> [[TMP2]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <8 x i1> [[TMP3]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <8 x i1> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i1> [[TMP6]], [[TMP13]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <8 x i1> [[TMP15]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <8 x i1> [[TMP10]] to <8 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i16> [[TMP2]], [[TMP11]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i16> [[TMP7]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i16> [[TMP9]] to <4 x i32>
 ; CHECK-NEXT:    [[C:%.*]] = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A]], <8 x i16> [[B]]) #[[ATTR2:[0-9]+]]
@@ -49,12 +53,16 @@ define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_me
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8>
-; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i8> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <8 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <8 x i8> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP16:%.*]] = and <8 x i1> [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i8>
-; CHECK-NEXT:    [[TMP7:%.*]] = and <8 x i8> [[TMP13]], [[TMP17]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <8 x i1> [[TMP17]], [[TMP15]]
+; CHECK-NEXT:    [[TMP12:%.*]] = and <8 x i1> [[TMP14]], [[TMP21]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i1> [[TMP16]], [[TMP11]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <8 x i1> [[TMP13]], [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <8 x i1> [[TMP22]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i8> [[TMP8]], [[TMP9]]

>From f439ae619f63ab353f095f75b0eb6804fadb05fe Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 13 Aug 2025 06:39:45 +0000
Subject: [PATCH 05/10] Undo AVX512 change that sneaked in

---
 .../Instrumentation/MemorySanitizer.cpp       |  18 ---
 .../X86/avx512bw-intrinsics-upgrade.ll        | 116 ++++++++----------
 .../X86/avx512bw-intrinsics.ll                | 116 ++++++++----------
 3 files changed, 100 insertions(+), 150 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index d3e686c26f188..c001d722fd60f 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -5474,32 +5474,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     // Multiply and Add Packed Words
     //   < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
     //   < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
-    //   <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)
     //
     // Multiply and Add Packed Signed and Unsigned Bytes
     //   < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
     //   <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
-    //   <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
-    //
-    // These intrinsics are auto-upgraded into non-masked forms:
-    //   < 4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128
-    //                  (<8 x i16>, <8 x i16>, <4 x i32>, i8)
-    //   < 8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256
-    //                  (<16 x i16>, <16 x i16>, <8 x i32>, i8)
-    //   <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512
-    //                  (<32 x i16>, <32 x i16>, <16 x i32>, i16)
-    //   < 8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128
-    //                  (<16 x i8>, <16 x i8>, <8 x i16>, i8)
-    //   <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256
-    //                  (<32 x i8>, <32 x i8>, <16 x i16>, i16)
-    //   <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512
-    //                  (<64 x i8>, <64 x i8>, <32 x i16>, i32)
     case Intrinsic::x86_sse2_pmadd_wd:
     case Intrinsic::x86_avx2_pmadd_wd:
-    case Intrinsic::x86_avx512_pmaddw_d_512:
     case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
     case Intrinsic::x86_avx2_pmadd_ub_sw:
-    case Intrinsic::x86_avx512_pmaddubs_w_512:
       handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
       break;
 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
index 23a4b952281ae..02df9c49a010b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
@@ -4930,22 +4930,18 @@ define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = and <64 x i1> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = and <64 x i1> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = and <64 x i1> [[TMP3]], [[TMP6]]
-; CHECK-NEXT:    [[TMP10:%.*]] = or <64 x i1> [[TMP17]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = or <64 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sext <64 x i1> [[TMP11]] to <64 x i8>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
-; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i8> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = zext <32 x i8> [[TMP15]] to <32 x i16>
-; CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]])
-; CHECK-NEXT:    store <32 x i16> [[TMP16]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <64 x i8> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
+; CHECK-NEXT:    store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[TMP7]]
 ;
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1)
@@ -4959,26 +4955,22 @@ define <32 x i16> @test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = and <64 x i1> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP20:%.*]] = and <64 x i1> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP21:%.*]] = and <64 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT:    [[TMP22:%.*]] = or <64 x i1> [[TMP19]], [[TMP20]]
-; CHECK-NEXT:    [[TMP23:%.*]] = or <64 x i1> [[TMP22]], [[TMP21]]
-; CHECK-NEXT:    [[TMP24:%.*]] = sext <64 x i1> [[TMP23]] to <64 x i8>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <64 x i8> [[TMP24]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <64 x i8> [[TMP24]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
-; CHECK-NEXT:    [[TMP17:%.*]] = or <32 x i8> [[TMP25]], [[TMP26]]
-; CHECK-NEXT:    [[TMP18:%.*]] = zext <32 x i8> [[TMP17]] to <32 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <64 x i8> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32 [[X3:%.*]] to <32 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP18]], <32 x i16> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <32 x i16> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or <32 x i16> [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i16> [[TMP14]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP15]], <32 x i16> [[TMP12]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[X2]]
@@ -4996,22 +4988,18 @@ define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = and <32 x i1> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = and <32 x i1> [[TMP3]], [[TMP6]]
-; CHECK-NEXT:    [[TMP10:%.*]] = or <32 x i1> [[TMP17]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = or <32 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i16>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i16> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = zext <16 x i16> [[TMP15]] to <16 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]])
-; CHECK-NEXT:    store <16 x i32> [[TMP16]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
+; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP7]]
 ;
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1)
@@ -5025,26 +5013,22 @@ define <16 x i32> @test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = and <32 x i1> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP20:%.*]] = and <32 x i1> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP21:%.*]] = and <32 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT:    [[TMP22:%.*]] = or <32 x i1> [[TMP19]], [[TMP20]]
-; CHECK-NEXT:    [[TMP23:%.*]] = or <32 x i1> [[TMP22]], [[TMP21]]
-; CHECK-NEXT:    [[TMP24:%.*]] = sext <32 x i1> [[TMP23]] to <32 x i16>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <32 x i16> [[TMP24]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <32 x i16> [[TMP24]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i16> [[TMP25]], [[TMP26]]
-; CHECK-NEXT:    [[TMP18:%.*]] = zext <16 x i16> [[TMP17]] to <16 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP18]], <16 x i32> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
index 7b48829f28756..78c272c7b2c5a 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
@@ -2204,22 +2204,18 @@ define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = and <64 x i1> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = and <64 x i1> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = and <64 x i1> [[TMP3]], [[TMP6]]
-; CHECK-NEXT:    [[TMP10:%.*]] = or <64 x i1> [[TMP17]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = or <64 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sext <64 x i1> [[TMP11]] to <64 x i8>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
-; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i8> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = zext <32 x i8> [[TMP15]] to <32 x i16>
-; CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]])
-; CHECK-NEXT:    store <32 x i16> [[TMP16]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <64 x i8> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
+; CHECK-NEXT:    store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[TMP7]]
 ;
   %1 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1)
@@ -2233,26 +2229,22 @@ define <32 x i16> @test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = and <64 x i1> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP20:%.*]] = and <64 x i1> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP21:%.*]] = and <64 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT:    [[TMP22:%.*]] = or <64 x i1> [[TMP19]], [[TMP20]]
-; CHECK-NEXT:    [[TMP23:%.*]] = or <64 x i1> [[TMP22]], [[TMP21]]
-; CHECK-NEXT:    [[TMP24:%.*]] = sext <64 x i1> [[TMP23]] to <64 x i8>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <64 x i8> [[TMP24]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <64 x i8> [[TMP24]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
-; CHECK-NEXT:    [[TMP17:%.*]] = or <32 x i8> [[TMP25]], [[TMP26]]
-; CHECK-NEXT:    [[TMP18:%.*]] = zext <32 x i8> [[TMP17]] to <32 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <64 x i8> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32 [[X3:%.*]] to <32 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP18]], <32 x i16> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <32 x i16> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or <32 x i16> [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i16> [[TMP14]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP15]], <32 x i16> [[TMP12]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[X2]]
@@ -2272,22 +2264,18 @@ define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = and <32 x i1> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = and <32 x i1> [[TMP3]], [[TMP6]]
-; CHECK-NEXT:    [[TMP10:%.*]] = or <32 x i1> [[TMP17]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = or <32 x i1> [[TMP10]], [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i16>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i16> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = zext <16 x i16> [[TMP15]] to <16 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]])
-; CHECK-NEXT:    store <16 x i32> [[TMP16]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
+; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP7]]
 ;
   %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1)
@@ -2301,26 +2289,22 @@ define <16 x i32> @test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = and <32 x i1> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP20:%.*]] = and <32 x i1> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP21:%.*]] = and <32 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT:    [[TMP22:%.*]] = or <32 x i1> [[TMP19]], [[TMP20]]
-; CHECK-NEXT:    [[TMP23:%.*]] = or <32 x i1> [[TMP22]], [[TMP21]]
-; CHECK-NEXT:    [[TMP24:%.*]] = sext <32 x i1> [[TMP23]] to <32 x i16>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <32 x i16> [[TMP24]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <32 x i16> [[TMP24]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i16> [[TMP25]], [[TMP26]]
-; CHECK-NEXT:    [[TMP18:%.*]] = zext <16 x i16> [[TMP17]] to <16 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP18]], <16 x i32> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]]

>From c4f9a3c5b4508584392a69816b9e78a8506a5fc0 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 13 Aug 2025 07:13:40 +0000
Subject: [PATCH 06/10] Add TODO

---
 llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index c001d722fd60f..4ea8a97cb569f 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3931,10 +3931,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     //                <16 x i8> into <4 x i8>  (reduction factor == 4)
     Value *OutShadow =
         horizontalReduce(I, ReductionFactor, ComboNonZero, nullptr);
+    // TODO: it could be faster to squash <8 x i1> into <4 x i2>, compare to
+    //       zero to get <4 x i1>, then sign-extend to <4 x i8>.
 
     // Extend to <4 x i32>.
     // For MMX, cast it back to <1 x i64>.
     OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
 
     setShadow(&I, OutShadow);
     setOriginForNaryOp(I);

>From ce01dafb7adb191a63756a06e70a89e8313808de Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 13 Aug 2025 07:21:48 +0000
Subject: [PATCH 07/10] Fix MMX return shadow cast

---
 .../Instrumentation/MemorySanitizer.cpp       | 19 +++++++++++++++----
 .../MemorySanitizer/X86/mmx-intrinsics.ll     |  8 ++++----
 .../MemorySanitizer/i386/mmx-intrinsics.ll    |  8 ++++----
 .../MemorySanitizer/vector_arith.ll           |  4 ++--
 4 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 4ea8a97cb569f..72538714a4b82 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3931,12 +3931,23 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     //                <16 x i8> into <4 x i8>  (reduction factor == 4)
     Value *OutShadow =
         horizontalReduce(I, ReductionFactor, ComboNonZero, nullptr);
-    // TODO: it could be faster to squash <8 x i1> into <4 x i2>, compare to
-    //       zero to get <4 x i1>, then sign-extend to <4 x i8>.
+    // TODO: it could be faster to bitcast <8 x i1> into <4 x i2>, compare to
+    //       zero to get <4 x i1>, then sign-extend to <4 x i8>. This
+    //       approximation works because we treat each element as either fully
+    //       initialized or fully uninitialized.
 
     // Extend to <4 x i32>.
-    // For MMX, cast it back to <1 x i64>.
-    OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
+    if (EltSizeInBits) {
+      FixedVectorType *ImplicitReturnType = cast<FixedVectorType>(
+          getMMXVectorTy(EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits()));
+      OutShadow = IRB.CreateSExt(OutShadow, ImplicitReturnType);
+    } else {
+      assert(cast<FixedVectorType>(OutShadow->getType())->getNumElements()
+             == cast<FixedVectorType>(getShadowTy(&I))->getNumElements());
+    }
+
+    // For MMX, cast it back to the required fake return type (<1 x i64>).
+    OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
 
     setShadow(&I, OutShadow);
     setOriginForNaryOp(I);
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index d7221cc612135..27363159ae792 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -1704,8 +1704,8 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP26:%.*]] = or <2 x i16> [[TMP24]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <2 x i16> [[TMP26]] to i32
-; CHECK-NEXT:    [[TMP28:%.*]] = zext i32 [[TMP27]] to i64
+; CHECK-NEXT:    [[TMP27:%.*]] = sext <2 x i16> [[TMP26]] to <2 x i32>
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <2 x i32> [[TMP27]] to i64
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP28]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP33:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
@@ -3347,8 +3347,8 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP27:%.*]] = or <4 x i8> [[TMP25]], [[TMP26]]
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i8> [[TMP27]] to i32
-; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP29]] to i64
+; CHECK-NEXT:    [[TMP29:%.*]] = sext <4 x i8> [[TMP27]] to <4 x i16>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x i16> [[TMP29]] to i64
 ; CHECK-NEXT:    [[TMP30:%.*]] = bitcast i64 [[TMP24]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP36:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[TMP30]] to <8 x i8>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
index 1c94dce8cfd4b..e7b347b87b0da 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
@@ -1747,8 +1747,8 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP27:%.*]] = or <2 x i16> [[TMP25]], [[TMP26]]
-; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <2 x i16> [[TMP27]] to i32
-; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
+; CHECK-NEXT:    [[TMP28:%.*]] = sext <2 x i16> [[TMP27]] to <2 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <2 x i32> [[TMP28]] to i64
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP29]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP34:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
@@ -3433,8 +3433,8 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP29:%.*]] = or <4 x i8> [[TMP27]], [[TMP28]]
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x i8> [[TMP29]] to i32
-; CHECK-NEXT:    [[TMP30:%.*]] = zext i32 [[TMP24]] to i64
+; CHECK-NEXT:    [[TMP42:%.*]] = sext <4 x i8> [[TMP29]] to <4 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i16> [[TMP42]] to i64
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast i64 [[TMP30]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP31:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[TMP32]] to <8 x i8>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index f18ebccf96a28..73ed4efc71c74 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -66,8 +66,8 @@ define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_me
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i8> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x i8> [[TMP10]] to i32
-; CHECK-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <4 x i8> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i16> [[TMP23]] to i64
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i64 [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[C:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[A]], <1 x i64> [[B]]) #[[ATTR2]]
 ; CHECK-NEXT:    store <1 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8

>From e8131ea3f74f9ed5a411f409defea249286c5564 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 13 Aug 2025 17:58:02 +0000
Subject: [PATCH 08/10] Optimize to avoid horizontalReduce / shufflevector.

---
 .../Instrumentation/MemorySanitizer.cpp       | 41 +++++++++----------
 .../X86/avx2-intrinsics-x86.ll                | 21 ++++------
 .../MemorySanitizer/X86/mmx-intrinsics.ll     | 14 +++----
 .../X86/sse2-intrinsics-x86.ll                |  7 ++--
 .../i386/avx2-intrinsics-i386.ll              | 21 ++++------
 .../MemorySanitizer/i386/mmx-intrinsics.ll    | 14 +++----
 .../i386/sse2-intrinsics-i386.ll              |  7 ++--
 .../MemorySanitizer/vector_arith.ll           | 14 +++----
 8 files changed, 61 insertions(+), 78 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 72538714a4b82..aa5cf7e779f8d 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3848,6 +3848,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   //
   // e.g., Two operands:
   //         <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
+  //
+  //       Two operands which require an EltSizeInBits override:
   //         <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
   //
   //       Three operands are not implemented yet:
@@ -3878,8 +3880,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     assert(ParamType->getPrimitiveSizeInBits() ==
            ReturnType->getPrimitiveSizeInBits());
 
+    FixedVectorType *ImplicitReturnType = ReturnType;
     // Step 1: instrument multiplication of corresponding vector elements
     if (EltSizeInBits) {
+      ImplicitReturnType = cast<FixedVectorType>(
+          getMMXVectorTy(EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits()));
       ParamType = cast<FixedVectorType>(
           getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
 
@@ -3902,7 +3907,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     //   OutShadow =   (SaNonZero & SbNonZero)
     //               | (VaNonZero & SbNonZero)
     //               | (SaNonZero & VbNonZero)
-    //   where non-zero is checked on a per-element basis.
+    //   where non-zero is checked on a per-element basis (not per bit).
     Value *SZero = Constant::getNullValue(Va->getType());
     Value *VZero = Constant::getNullValue(Sa->getType());
     Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero);
@@ -3916,7 +3921,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
     // Each element of the vector is represented by a single bit (poisoned or
     // not) e.g., <8 x i1>.
-    Value *ComboNonZero =
+    Value *And =
         IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero});
 
     // Extend <8 x i1> to <8 x i16>.
@@ -3924,30 +3929,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     // <8 x i32>, but that is irrelevant for our shadow purposes because we
     // consider each element to be either fully initialized or fully
     // uninitialized.)
-    ComboNonZero = IRB.CreateSExt(ComboNonZero, Sa->getType());
+    And = IRB.CreateSExt(And, Sa->getType());
 
     // Step 2: instrument horizontal add
-    // e.g., collapse <8 x i16> into <4 x i16> (reduction factor == 2)
-    //                <16 x i8> into <4 x i8>  (reduction factor == 4)
-    Value *OutShadow =
-        horizontalReduce(I, ReductionFactor, ComboNonZero, nullptr);
-    // TODO: it could be faster to bitcast <8 x i1> into <4 x i2>, compare to
-    //       zero to get <4 x i1>, then sign-extend to <4 x i8>. This
-    //       approximation works because we treat each element as either fully
-    //       initialized or fully uninitialized.
-
-    // Extend to <4 x i32>.
-    if (EltSizeInBits) {
-      FixedVectorType *ImplicitReturnType = cast<FixedVectorType>(
-          getMMXVectorTy(EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits()));
-      OutShadow = IRB.CreateSExt(OutShadow, ImplicitReturnType);
-    } else {
-      assert(cast<FixedVectorType>(OutShadow->getType())->getNumElements()
-             == cast<FixedVectorType>(getShadowTy(&I))->getNumElements());
-    }
+    // We don't need bit-precise horizontalReduce because we only want to check
+    // if each pair of elements is fully zero.
+    // Cast to <4 x i32>.
+    Value *Horizontal = IRB.CreateBitCast(And, ImplicitReturnType);
+
+    // Compute <4 x i1>, then extend back to <4 x i32>.
+    Value *OutShadow = IRB.CreateSExt(
+                                         IRB.CreateICmpNE(Horizontal, Constant::getNullValue(Horizontal->getType())),
+                                         ImplicitReturnType);
 
     // For MMX, cast it back to the required fake return type (<1 x i64>).
-    OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
+    if (EltSizeInBits)
+        OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
 
     setShadow(&I, OutShadow);
     setOriginForNaryOp(I);
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
index 26a6a3bdb5c0f..cc07958bd9f26 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
@@ -150,10 +150,9 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP11]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sext <16 x i1> [[TMP17]] to <16 x i16>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <8 x i16> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <8 x i16> [[TMP10]] to <8 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i16> [[TMP7]] to <8 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
 ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
@@ -697,10 +696,9 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP11]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i8>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <16 x i8> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <32 x i8> [[TMP7]] to <16 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16>
 ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
@@ -736,10 +734,9 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT:    [[TMP21:%.*]] = or <32 x i1> [[TMP16]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = or <32 x i1> [[TMP21]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = sext <32 x i1> [[TMP22]] to <32 x i8>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i8> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP15]] to <16 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <32 x i8> [[TMP12]] to <16 x i16>
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp ne <16 x i16> [[TMP23]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <16 x i1> [[TMP24]] to <16 x i16>
 ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index 27363159ae792..99eafc13b2bf1 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -1701,10 +1701,9 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP37:%.*]] = or <4 x i1> [[TMP31]], [[TMP35]]
 ; CHECK-NEXT:    [[TMP38:%.*]] = or <4 x i1> [[TMP37]], [[TMP36]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = sext <4 x i1> [[TMP38]] to <4 x i16>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP26:%.*]] = or <2 x i16> [[TMP24]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = sext <2 x i16> [[TMP26]] to <2 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x i16> [[TMP23]] to <2 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP27:%.*]] = sext <2 x i1> [[TMP25]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <2 x i32> [[TMP27]] to i64
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP28]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP33:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
@@ -3344,10 +3343,9 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP40:%.*]] = or <8 x i1> [[TMP34]], [[TMP38]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = or <8 x i1> [[TMP40]], [[TMP39]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = sext <8 x i1> [[TMP41]] to <8 x i8>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP27:%.*]] = or <4 x i8> [[TMP25]], [[TMP26]]
-; CHECK-NEXT:    [[TMP29:%.*]] = sext <4 x i8> [[TMP27]] to <4 x i16>
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i8> [[TMP16]] to <4 x i16>
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <4 x i16> [[TMP26]], zeroinitializer
+; CHECK-NEXT:    [[TMP29:%.*]] = sext <4 x i1> [[TMP25]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x i16> [[TMP29]] to i64
 ; CHECK-NEXT:    [[TMP30:%.*]] = bitcast i64 [[TMP24]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP36:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
index eba94a5be92c6..7048050180792 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
@@ -772,10 +772,9 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i1> [[TMP11]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sext <8 x i1> [[TMP17]] to <8 x i16>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i16> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP10]] to <4 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x i16> [[TMP7]] to <4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
 ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0]], <8 x i16> [[A1]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
index 51f90c0cdf028..9960b80f2856b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
@@ -159,10 +159,9 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i1> [[TMP13]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = or <16 x i1> [[TMP18]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i16> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <8 x i16> [[TMP11]] to <8 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <16 x i16> [[TMP8]] to <8 x i32>
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <8 x i32> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i1> [[TMP21]] to <8 x i32>
 ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
@@ -734,10 +733,9 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP18:%.*]] = or <32 x i1> [[TMP13]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = or <32 x i1> [[TMP18]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = sext <32 x i1> [[TMP19]] to <32 x i8>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i8> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[TMP11]] to <16 x i16>
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <16 x i16> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i1> [[TMP21]] to <16 x i16>
 ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
@@ -774,10 +772,9 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT:    [[TMP22:%.*]] = or <32 x i1> [[TMP12]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = or <32 x i1> [[TMP22]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <32 x i1> [[TMP23]] to <32 x i8>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i8> [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i16>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <32 x i8> [[TMP13]] to <16 x i16>
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <16 x i16> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <16 x i1> [[TMP25]] to <16 x i16>
 ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
index e7b347b87b0da..74822de4962b2 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
@@ -1744,10 +1744,9 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP38:%.*]] = or <4 x i1> [[TMP32]], [[TMP36]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = or <4 x i1> [[TMP38]], [[TMP37]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = sext <4 x i1> [[TMP39]] to <4 x i16>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP27:%.*]] = or <2 x i16> [[TMP25]], [[TMP26]]
-; CHECK-NEXT:    [[TMP28:%.*]] = sext <2 x i16> [[TMP27]] to <2 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[TMP24]] to <2 x i32>
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp ne <2 x i32> [[TMP25]], zeroinitializer
+; CHECK-NEXT:    [[TMP28:%.*]] = sext <2 x i1> [[TMP26]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <2 x i32> [[TMP28]] to i64
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP29]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP34:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
@@ -3430,10 +3429,9 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP40:%.*]] = or <8 x i1> [[TMP35]], [[TMP38]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = or <8 x i1> [[TMP40]], [[TMP39]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = sext <8 x i1> [[TMP41]] to <8 x i8>
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP29:%.*]] = or <4 x i8> [[TMP27]], [[TMP28]]
-; CHECK-NEXT:    [[TMP42:%.*]] = sext <4 x i8> [[TMP29]] to <4 x i16>
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <8 x i8> [[TMP16]] to <4 x i16>
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp ne <4 x i16> [[TMP27]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = sext <4 x i1> [[TMP28]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i16> [[TMP42]] to i64
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast i64 [[TMP30]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP31:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
index 1bf99f58f2ef5..3a37eafd78ecb 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
@@ -810,10 +810,9 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP18:%.*]] = or <8 x i1> [[TMP13]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = or <8 x i1> [[TMP18]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i16> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x i16> [[TMP8]] to <4 x i32>
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i32> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP21]] to <4 x i32>
 ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0]], <8 x i16> [[A1]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index 73ed4efc71c74..d1060fb33e1bc 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -27,10 +27,9 @@ define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i1> [[TMP6]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = or <8 x i1> [[TMP15]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <8 x i1> [[TMP10]] to <8 x i16>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i16> [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i16> [[TMP9]] to <4 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i16> [[TMP11]] to <4 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <4 x i32> [[TMP16]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP17]] to <4 x i32>
 ; CHECK-NEXT:    [[C:%.*]] = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A]], <8 x i16> [[B]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[C]]
@@ -63,10 +62,9 @@ define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_me
 ; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i1> [[TMP16]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = or <8 x i1> [[TMP13]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sext <8 x i1> [[TMP22]] to <8 x i8>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i8> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP23:%.*]] = sext <4 x i8> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp ne <4 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <4 x i1> [[TMP24]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i16> [[TMP23]] to i64
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i64 [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[C:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[A]], <1 x i64> [[B]]) #[[ATTR2]]

>From 2d68920a777255355971514d71ebe28eb8d8bade Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 13 Aug 2025 18:10:04 +0000
Subject: [PATCH 09/10] Fix new test

---
 compiler-rt/lib/msan/tests/msan_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/lib/msan/tests/msan_test.cpp b/compiler-rt/lib/msan/tests/msan_test.cpp
index bcf048f04c5c2..b0d8409d97ffc 100644
--- a/compiler-rt/lib/msan/tests/msan_test.cpp
+++ b/compiler-rt/lib/msan/tests/msan_test.cpp
@@ -4290,7 +4290,7 @@ TEST(VectorMaddTest, mmx_pmadd_wd) {
 
   V4x16 d = {Poisoned<U2>(0), 1, 0, 3};
   V4x16 e = {100, 101, Poisoned<U2>(102), 103};
-  V2x32 f = _mm_madd_pi16(a, b);
+  V2x32 f = _mm_madd_pi16(d, e);
   // Multiply step:
   //    {Poison * 100, 1 * 101, 0 * Poison, 3 * 103}
   // == {Poison,       1 * 101, 0         , 3 * 103}

>From d1ce5f6f7d156090003a9db90fdaea6827df9497 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 13 Aug 2025 18:10:20 +0000
Subject: [PATCH 10/10] clang-format

---
 .../Transforms/Instrumentation/MemorySanitizer.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index aa5cf7e779f8d..3ecace5cfe6e4 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3883,8 +3883,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     FixedVectorType *ImplicitReturnType = ReturnType;
     // Step 1: instrument multiplication of corresponding vector elements
     if (EltSizeInBits) {
-      ImplicitReturnType = cast<FixedVectorType>(
-          getMMXVectorTy(EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits()));
+      ImplicitReturnType = cast<FixedVectorType>(getMMXVectorTy(
+          EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits()));
       ParamType = cast<FixedVectorType>(
           getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
 
@@ -3921,8 +3921,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
     // Each element of the vector is represented by a single bit (poisoned or
     // not) e.g., <8 x i1>.
-    Value *And =
-        IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero});
+    Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero});
 
     // Extend <8 x i1> to <8 x i16>.
     // (The real pmadd intrinsic would have computed intermediate values of
@@ -3939,12 +3938,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
     // Compute <4 x i1>, then extend back to <4 x i32>.
     Value *OutShadow = IRB.CreateSExt(
-                                         IRB.CreateICmpNE(Horizontal, Constant::getNullValue(Horizontal->getType())),
-                                         ImplicitReturnType);
+        IRB.CreateICmpNE(Horizontal,
+                         Constant::getNullValue(Horizontal->getType())),
+        ImplicitReturnType);
 
     // For MMX, cast it back to the required fake return type (<1 x i64>).
     if (EltSizeInBits)
-        OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
+      OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
 
     setShadow(&I, OutShadow);
     setOriginForNaryOp(I);



More information about the llvm-commits mailing list