[llvm-branch-commits] [llvm] d7b5ea3 - Revert "[msan] Improve packed multiply-add instrumentation (#152941)"

Tue Aug 12 21:31:36 PDT 2025

Author: Thurston Dang
Date: 2025-08-12T21:31:33-07:00
New Revision: d7b5ea35082fdf50f3f07d7fc59f0e50f9af6dba

URL: https://github.com/llvm/llvm-project/commit/d7b5ea35082fdf50f3f07d7fc59f0e50f9af6dba
DIFF: https://github.com/llvm/llvm-project/commit/d7b5ea35082fdf50f3f07d7fc59f0e50f9af6dba.diff

LOG: Revert "[msan] Improve packed multiply-add instrumentation (#152941)"

This reverts commit ba603b5e4d44f1a25207a2a00196471d2ba93424.

Added: 
    

Modified: 
    llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
    llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
    llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
    llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
    llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
    llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
    llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
    llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 4cef57d43f203..21bd4164385ab 100644

--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3641,10 +3641,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
-  // Get an MMX-sized (64-bit) vector type, or optionally, other sized
-  // vectors.
-  Type *getMMXVectorTy(unsigned EltSizeInBits,
-                       unsigned X86_MMXSizeInBits = 64) {
+  // Get an MMX-sized vector type.
+  Type *getMMXVectorTy(unsigned EltSizeInBits) {
+    const unsigned X86_MMXSizeInBits = 64;
     assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
            "Illegal MMX vector element size");
     return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
@@ -3844,78 +3843,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
-  // Instrument multiply-add intrinsics.
-  //
-  // e.g., Two operands:
-  //         <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
-  //         <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
-  //
-  //       Three operands are not implemented yet:
-  //         <4 x i32> @llvm.x86.avx512.vpdpbusd.128
-  //                       (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
-  //         (the result of multiply-add'ing %a and %b is accumulated with %s)
-  void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
-                                  unsigned EltSizeInBits = 0) {
+  // Instrument multiply-add intrinsic.
+  void handleVectorPmaddIntrinsic(IntrinsicInst &I,
+                                  unsigned MMXEltSizeInBits = 0) {
+    Type *ResTy =
+        MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType();
     IRBuilder<> IRB(&I);
-
-    [[maybe_unused]] FixedVectorType *ReturnType =
-        cast<FixedVectorType>(I.getType());
-    assert(isa<FixedVectorType>(ReturnType));
-
-    assert(I.arg_size() == 2);
-
-    // Vectors A and B, and shadows
-    Value *Va = I.getOperand(0);
-    Value *Vb = I.getOperand(1);
-
-    Value *Sa = getShadow(&I, 0);
-    Value *Sb = getShadow(&I, 1);
-
-    FixedVectorType *ParamType =
-        cast<FixedVectorType>(I.getArgOperand(0)->getType());
-    assert(ParamType == I.getArgOperand(1)->getType());
-
-    assert(ParamType->getPrimitiveSizeInBits() ==
-           ReturnType->getPrimitiveSizeInBits());
-
-    // Step 1: instrument multiplication of corresponding vector elements
-    if (EltSizeInBits) {
-      ParamType = cast<FixedVectorType>(
-          getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
-
-      Va = IRB.CreateBitCast(Va, ParamType);
-      Vb = IRB.CreateBitCast(Vb, ParamType);
-
-      Sa = IRB.CreateBitCast(Sa, getShadowTy(ParamType));
-      Sb = IRB.CreateBitCast(Sb, getShadowTy(ParamType));
-    } else {
-      assert(ParamType->getNumElements() ==
-             ReturnType->getNumElements() * ReductionFactor);
-    }
-
-    Value *Sab = IRB.CreateOr(Sa, Sb);
-
-    // Multiplying an uninitialized / element by zero results in an initialized
-    // element.
-    Value *Zero = Constant::getNullValue(Va->getType());
-    Value *VaNotZero = IRB.CreateICmpNE(Va, Zero);
-    Value *VbNotZero = IRB.CreateICmpNE(Vb, Zero);
-    Value *VaAndVbNotZero = IRB.CreateAnd(VaNotZero, VbNotZero);
-
-    // After multiplying e.g., <8 x i16> %a, <8 x i16> %b, we should have
-    // <8 x i32> %ab, but we cheated and ended up with <8 x i16>.
-    Sab = IRB.CreateAnd(Sab, IRB.CreateSExt(VaAndVbNotZero, Sab->getType()));
-
-    // Step 2: instrument horizontal add
-    // e.g., collapse <8 x i16> into <4 x i16> (reduction factor == 2)
-    //                <16 x i8> into <4 x i8>  (reduction factor == 4)
-    Value *OutShadow = horizontalReduce(I, ReductionFactor, Sab, nullptr);
-
-    // Extend to <4 x i32>.
-    // For MMX, cast it back to <1 x i64>.
-    OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
-
-    setShadow(&I, OutShadow);
+    auto *Shadow0 = getShadow(&I, 0);
+    auto *Shadow1 = getShadow(&I, 1);
+    Value *S = IRB.CreateOr(Shadow0, Shadow1);
+    S = IRB.CreateBitCast(S, ResTy);
+    S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
+                       ResTy);
+    S = IRB.CreateBitCast(S, getShadowTy(&I));
+    setShadow(&I, S);
     setOriginForNaryOp(I);
   }
 
@@ -5450,28 +5391,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       handleVectorSadIntrinsic(I);
       break;
 
-    // Multiply and Add Packed Words
-    //   < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
-    //   < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
-
-    // Multiply and Add Packed Signed and Unsigned Bytes
-    //   < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
-    //   <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
     case Intrinsic::x86_sse2_pmadd_wd:
     case Intrinsic::x86_avx2_pmadd_wd:
     case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
     case Intrinsic::x86_avx2_pmadd_ub_sw:
-      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
+      handleVectorPmaddIntrinsic(I);
       break;
 
-    // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
     case Intrinsic::x86_ssse3_pmadd_ub_sw:
-      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8);
+      handleVectorPmaddIntrinsic(I, 8);
       break;
 
-    // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
     case Intrinsic::x86_mmx_pmadd_wd:
-      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+      handleVectorPmaddIntrinsic(I, 16);
       break;
 
     case Intrinsic::x86_sse_cmp_ss:

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
index ab05e10ccd3c9..f916130fe53e5 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
@@ -141,16 +141,10 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i16>
-; CHECK-NEXT:    [[TMP7:%.*]] = and <16 x i16> [[TMP3]], [[TMP12]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <8 x i16> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <8 x i16> [[TMP10]] to <8 x i32>
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
@@ -684,16 +678,10 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = and <32 x i1> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i8>
-; CHECK-NEXT:    [[TMP7:%.*]] = and <32 x i8> [[TMP3]], [[TMP12]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <16 x i8> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i16>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16>
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
 ;
@@ -719,16 +707,10 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP16:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
-; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i8> [[TMP8]], [[TMP17]]
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i8> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP15]] to <16 x i16>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i16>
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
 ;

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index a0341c67b1365..ac3bb56719038 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -1687,27 +1687,16 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[MMX_VAR_I]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
-; CHECK-NEXT:    [[TMP22:%.*]] = or <4 x i16> [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
-; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP31:%.*]] = and <4 x i1> [[TMP29]], [[TMP30]]
-; CHECK-NEXT:    [[TMP32:%.*]] = sext <4 x i1> [[TMP31]] to <4 x i16>
-; CHECK-NEXT:    [[TMP23:%.*]] = and <4 x i16> [[TMP22]], [[TMP32]]
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP26:%.*]] = or <2 x i16> [[TMP24]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <2 x i16> [[TMP26]] to i32
-; CHECK-NEXT:    [[TMP28:%.*]] = zext i32 [[TMP27]] to i64
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP28]] to <1 x i64>
-; CHECK-NEXT:    [[TMP33:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
-; CHECK-NEXT:    [[TMP34:%.*]] = bitcast <1 x i64> [[TMP33]] to <2 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP34]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3326,27 +3315,16 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP22]] to <8 x i8>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i8> [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    [[TMP32:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
-; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP34:%.*]] = and <8 x i1> [[TMP32]], [[TMP33]]
-; CHECK-NEXT:    [[TMP35:%.*]] = sext <8 x i1> [[TMP34]] to <8 x i8>
-; CHECK-NEXT:    [[TMP16:%.*]] = and <8 x i8> [[TMP14]], [[TMP35]]
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP27:%.*]] = or <4 x i8> [[TMP25]], [[TMP26]]
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i8> [[TMP27]] to i32
-; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP29]] to i64
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast i64 [[TMP24]] to <1 x i64>
-; CHECK-NEXT:    [[TMP36:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[TMP30]] to <8 x i8>
-; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <1 x i64> [[TMP36]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP28]] to <1 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
index fe1245553c116..8f915a59db8e5 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
@@ -763,16 +763,10 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sext <8 x i1> [[TMP11]] to <8 x i16>
-; CHECK-NEXT:    [[TMP7:%.*]] = and <8 x i16> [[TMP3]], [[TMP12]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i16> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP10]] to <4 x i32>
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0]], <8 x i16> [[A1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
index bf87027b056fa..5cc56baf0e0de 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
@@ -149,17 +149,11 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP5]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16>
-; CHECK-NEXT:    [[TMP8:%.*]] = and <16 x i16> [[TMP4]], [[TMP14]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i16> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <8 x i16> [[TMP11]] to <8 x i32>
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
@@ -720,17 +714,11 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP5]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i8>
-; CHECK-NEXT:    [[TMP8:%.*]] = and <32 x i8> [[TMP4]], [[TMP14]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i8> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[TMP11]] to <16 x i16>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16>
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
 ;
@@ -746,7 +734,7 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
 ; CHECK:       4:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
@@ -756,17 +744,11 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT:    [[TMP6:%.*]] = and i64 [[TMP5]], -2147483649
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
-; CHECK-NEXT:    [[TMP9:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP17]]
-; CHECK-NEXT:    [[TMP18:%.*]] = sext <32 x i1> [[TMP12]] to <32 x i8>
-; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i8> [[TMP9]], [[TMP18]]
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i8> [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i16>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i16>
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
 ;

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
index e0e75a91cbd2e..0a3efaaea149f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
@@ -1730,27 +1730,16 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[MMX_VAR_I]] to <4 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16>
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
-; CHECK-NEXT:    [[TMP23:%.*]] = or <4 x i16> [[TMP11]], [[TMP22]]
-; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer
-; CHECK-NEXT:    [[TMP32:%.*]] = and <4 x i1> [[TMP30]], [[TMP31]]
-; CHECK-NEXT:    [[TMP33:%.*]] = sext <4 x i1> [[TMP32]] to <4 x i16>
-; CHECK-NEXT:    [[TMP24:%.*]] = and <4 x i16> [[TMP23]], [[TMP33]]
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP27:%.*]] = or <2 x i16> [[TMP25]], [[TMP26]]
-; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <2 x i16> [[TMP27]] to i32
-; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP29]] to <1 x i64>
-; CHECK-NEXT:    [[TMP34:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
-; CHECK-NEXT:    [[TMP35:%.*]] = bitcast <1 x i64> [[TMP34]] to <2 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP35]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3412,27 +3401,16 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP22]] to <8 x i8>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8>
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-; CHECK-NEXT:    [[TMP26:%.*]] = or <8 x i8> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <8 x i8> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    [[TMP35:%.*]] = and <8 x i1> [[TMP33]], [[TMP34]]
-; CHECK-NEXT:    [[TMP36:%.*]] = sext <8 x i1> [[TMP35]] to <8 x i8>
-; CHECK-NEXT:    [[TMP16:%.*]] = and <8 x i8> [[TMP26]], [[TMP36]]
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP29:%.*]] = or <4 x i8> [[TMP27]], [[TMP28]]
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x i8> [[TMP29]] to i32
-; CHECK-NEXT:    [[TMP30:%.*]] = zext i32 [[TMP24]] to i64
-; CHECK-NEXT:    [[TMP32:%.*]] = bitcast i64 [[TMP30]] to <1 x i64>
-; CHECK-NEXT:    [[TMP31:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[TMP32]] to <8 x i8>
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[TMP31]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
index 5edc13b7abbc4..e771e60e2f294 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
@@ -800,17 +800,11 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = and <8 x i1> [[TMP5]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16>
-; CHECK-NEXT:    [[TMP8:%.*]] = and <8 x i16> [[TMP4]], [[TMP14]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i16> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32>
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0]], <8 x i16> [[A1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index 21996b13a9961..d614bb85d8584 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -18,15 +18,9 @@ define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i16> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i16> [[A]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <8 x i1> [[TMP10]] to <8 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i16> [[TMP2]], [[TMP11]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i16> [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i16> [[TMP9]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
 ; CHECK-NEXT:    [[C:%.*]] = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A]], <8 x i16> [[B]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[C]]
@@ -45,24 +39,13 @@ define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_me
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8>
-; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i8> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP16:%.*]] = and <8 x i1> [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i8>
-; CHECK-NEXT:    [[TMP7:%.*]] = and <8 x i8> [[TMP13]], [[TMP17]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i8> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x i8> [[TMP10]] to i32
-; CHECK-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
-; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i64 [[TMP19]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = or <1 x i64> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[C:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[A]], <1 x i64> [[B]]) #[[ATTR2]]
-; CHECK-NEXT:    store <1 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <1 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <1 x i64> [[C]]
 ;
 entry: