[compiler-rt] 2b75ff1 - [msan] Reland with even more improvement: Improve packed multiply-add instrumentation (#153353)

Fri Aug 15 16:35:46 PDT 2025

Author: Thurston Dang
Date: 2025-08-15T16:35:42-07:00
New Revision: 2b75ff192d03033c136f7641bb93ea52fec54c14

URL: https://github.com/llvm/llvm-project/commit/2b75ff192d03033c136f7641bb93ea52fec54c14
DIFF: https://github.com/llvm/llvm-project/commit/2b75ff192d03033c136f7641bb93ea52fec54c14.diff

LOG: [msan] Reland with even more improvement: Improve packed multiply-add instrumentation (#153353)

This reverts commit cf002847a464c004a57ca4777251b1aafc33d958 i.e.,
relands ba603b5e4d44f1a25207a2a00196471d2ba93424. It was reverted
because it was subtly wrong: multiplying an uninitialized zero should
not result in an initialized zero.

This reland fixes the issue by using instrumentation analogous to
visitAnd (bitwise AND of an initialized zero and an uninitialized value
results in an initialized value). Additionally, this reland expands a
test case; fixes the commit message; and optimizes the change to avoid
the need for horizontalReduce.

The current instrumentation has false positives: it does not take into
account that multiplying an initialized zero value with an uninitialized
value results in an initialized zero value This change fixes the issue
during the multiplication step. The horizontal add step is modeled using
bitwise OR.
    
Future work can apply this improved handler to the AVX512 equivalent
intrinsics (x86_avx512_pmaddw_d_512, x86_avx512_pmaddubs_w_512.) and AVX
VNNI intrinsics.

Added: 
    

Modified: 
    compiler-rt/lib/msan/tests/msan_test.cpp
    llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
    llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
    llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
    llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
    llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
    llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
    llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
    llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll

Removed: 
    


################################################################################
diff  --git a/compiler-rt/lib/msan/tests/msan_test.cpp b/compiler-rt/lib/msan/tests/msan_test.cpp
index d1c481483dfad..b0d8409d97ffc 100644

--- a/compiler-rt/lib/msan/tests/msan_test.cpp
+++ b/compiler-rt/lib/msan/tests/msan_test.cpp
@@ -4271,14 +4271,39 @@ TEST(VectorSadTest, sse2_psad_bw) {
 }
 
 TEST(VectorMaddTest, mmx_pmadd_wd) {
-  V4x16 a = {Poisoned<U2>(), 1, 2, 3};
+  V4x16 a = {Poisoned<U2>(0), 1, 2, 3};
   V4x16 b = {100, 101, 102, 103};
   V2x32 c = _mm_madd_pi16(a, b);
+  // Multiply step:
+  //    {Poison * 100, 1 * 101, 2 * 102, 3 * 103}
+  // == {Poison,       1 * 101, 2 * 102, 3 * 103}
+  //    Notice that for the poisoned value, we ignored the concrete zero value.
+  //
+  // Horizontal add step:
+  //    {Poison + 1 * 101, 2 * 102 + 3 * 103}
+  // == {Poison,           2 * 102 + 3 * 103}
 
   EXPECT_POISONED(c[0]);
   EXPECT_NOT_POISONED(c[1]);
 
   EXPECT_EQ((unsigned)(2 * 102 + 3 * 103), c[1]);
+
+  V4x16 d = {Poisoned<U2>(0), 1, 0, 3};
+  V4x16 e = {100, 101, Poisoned<U2>(102), 103};
+  V2x32 f = _mm_madd_pi16(d, e);
+  // Multiply step:
+  //    {Poison * 100, 1 * 101, 0 * Poison, 3 * 103}
+  // == {Poison,       1 * 101, 0         , 3 * 103}
+  //    Notice that 0 * Poison == 0.
+  //
+  // Horizontal add step:
+  //    {Poison + 1 * 101, 0 + 3 * 103}
+  // == {Poison,           3 * 103}
+
+  EXPECT_POISONED(f[0]);
+  EXPECT_NOT_POISONED(f[1]);
+
+  EXPECT_EQ((unsigned)(3 * 103), f[1]);
 }
 
 TEST(VectorCmpTest, mm_cmpneq_ps) {

diff  --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 21bd4164385ab..3ecace5cfe6e4 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3641,9 +3641,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
-  // Get an MMX-sized vector type.
-  Type *getMMXVectorTy(unsigned EltSizeInBits) {
-    const unsigned X86_MMXSizeInBits = 64;
+  // Get an MMX-sized (64-bit) vector type, or optionally, other sized
+  // vectors.
+  Type *getMMXVectorTy(unsigned EltSizeInBits,
+                       unsigned X86_MMXSizeInBits = 64) {
     assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
            "Illegal MMX vector element size");
     return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
@@ -3843,20 +3844,109 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
-  // Instrument multiply-add intrinsic.
-  void handleVectorPmaddIntrinsic(IntrinsicInst &I,
-                                  unsigned MMXEltSizeInBits = 0) {
-    Type *ResTy =
-        MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType();
+  // Instrument multiply-add intrinsics.
+  //
+  // e.g., Two operands:
+  //         <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
+  //
+  //       Two operands which require an EltSizeInBits override:
+  //         <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
+  //
+  //       Three operands are not implemented yet:
+  //         <4 x i32> @llvm.x86.avx512.vpdpbusd.128
+  //                       (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
+  //         (the result of multiply-add'ing %a and %b is accumulated with %s)
+  void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
+                                  unsigned EltSizeInBits = 0) {
     IRBuilder<> IRB(&I);
-    auto *Shadow0 = getShadow(&I, 0);
-    auto *Shadow1 = getShadow(&I, 1);
-    Value *S = IRB.CreateOr(Shadow0, Shadow1);
-    S = IRB.CreateBitCast(S, ResTy);
-    S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
-                       ResTy);
-    S = IRB.CreateBitCast(S, getShadowTy(&I));
-    setShadow(&I, S);
+
+    [[maybe_unused]] FixedVectorType *ReturnType =
+        cast<FixedVectorType>(I.getType());
+    assert(isa<FixedVectorType>(ReturnType));
+
+    assert(I.arg_size() == 2);
+
+    // Vectors A and B, and shadows
+    Value *Va = I.getOperand(0);
+    Value *Vb = I.getOperand(1);
+
+    Value *Sa = getShadow(&I, 0);
+    Value *Sb = getShadow(&I, 1);
+
+    FixedVectorType *ParamType =
+        cast<FixedVectorType>(I.getArgOperand(0)->getType());
+    assert(ParamType == I.getArgOperand(1)->getType());
+
+    assert(ParamType->getPrimitiveSizeInBits() ==
+           ReturnType->getPrimitiveSizeInBits());
+
+    FixedVectorType *ImplicitReturnType = ReturnType;
+    // Step 1: instrument multiplication of corresponding vector elements
+    if (EltSizeInBits) {
+      ImplicitReturnType = cast<FixedVectorType>(getMMXVectorTy(
+          EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits()));
+      ParamType = cast<FixedVectorType>(
+          getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
+
+      Va = IRB.CreateBitCast(Va, ParamType);
+      Vb = IRB.CreateBitCast(Vb, ParamType);
+
+      Sa = IRB.CreateBitCast(Sa, getShadowTy(ParamType));
+      Sb = IRB.CreateBitCast(Sb, getShadowTy(ParamType));
+    } else {
+      assert(ParamType->getNumElements() ==
+             ReturnType->getNumElements() * ReductionFactor);
+    }
+
+    // Multiplying an *initialized* zero by an uninitialized element results in
+    // an initialized zero element.
+    //
+    // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value
+    // results in an unpoisoned value. We can therefore adapt the visitAnd()
+    // instrumentation:
+    //   OutShadow =   (SaNonZero & SbNonZero)
+    //               | (VaNonZero & SbNonZero)
+    //               | (SaNonZero & VbNonZero)
+    //   where non-zero is checked on a per-element basis (not per bit).
+    Value *SZero = Constant::getNullValue(Va->getType());
+    Value *VZero = Constant::getNullValue(Sa->getType());
+    Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero);
+    Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero);
+    Value *VaNonZero = IRB.CreateICmpNE(Va, VZero);
+    Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero);
+
+    Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero);
+    Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero);
+    Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero);
+
+    // Each element of the vector is represented by a single bit (poisoned or
+    // not) e.g., <8 x i1>.
+    Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero});
+
+    // Extend <8 x i1> to <8 x i16>.
+    // (The real pmadd intrinsic would have computed intermediate values of
+    // <8 x i32>, but that is irrelevant for our shadow purposes because we
+    // consider each element to be either fully initialized or fully
+    // uninitialized.)
+    And = IRB.CreateSExt(And, Sa->getType());
+
+    // Step 2: instrument horizontal add
+    // We don't need bit-precise horizontalReduce because we only want to check
+    // if each pair of elements is fully zero.
+    // Cast to <4 x i32>.
+    Value *Horizontal = IRB.CreateBitCast(And, ImplicitReturnType);
+
+    // Compute <4 x i1>, then extend back to <4 x i32>.
+    Value *OutShadow = IRB.CreateSExt(
+        IRB.CreateICmpNE(Horizontal,
+                         Constant::getNullValue(Horizontal->getType())),
+        ImplicitReturnType);
+
+    // For MMX, cast it back to the required fake return type (<1 x i64>).
+    if (EltSizeInBits)
+      OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
+
+    setShadow(&I, OutShadow);
     setOriginForNaryOp(I);
   }
 
@@ -5391,19 +5481,28 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       handleVectorSadIntrinsic(I);
       break;
 
+    // Multiply and Add Packed Words
+    //   < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
+    //
+    // Multiply and Add Packed Signed and Unsigned Bytes
+    //   < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
+    //   <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
     case Intrinsic::x86_sse2_pmadd_wd:
     case Intrinsic::x86_avx2_pmadd_wd:
     case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
     case Intrinsic::x86_avx2_pmadd_ub_sw:
-      handleVectorPmaddIntrinsic(I);
+      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
       break;
 
+    // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
     case Intrinsic::x86_ssse3_pmadd_ub_sw:
-      handleVectorPmaddIntrinsic(I, 8);
+      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8);
       break;
 
+    // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
     case Intrinsic::x86_mmx_pmadd_wd:
-      handleVectorPmaddIntrinsic(I, 16);
+      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
       break;
 
     case Intrinsic::x86_sse_cmp_ss:

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
index f916130fe53e5..cc07958bd9f26 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
@@ -140,11 +140,20 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32>
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i1> [[TMP12]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <16 x i1> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i1> [[TMP11]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <16 x i1> [[TMP17]] to <16 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i16> [[TMP7]] to <8 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
@@ -677,11 +686,20 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = and <32 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP12]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <32 x i1> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP11]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <32 x i8> [[TMP7]] to <16 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16>
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
 ;
@@ -706,11 +724,20 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
-; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i16>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP19:%.*]] = and <32 x i1> [[TMP17]], [[TMP10]]
+; CHECK-NEXT:    [[TMP20:%.*]] = and <32 x i1> [[TMP9]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or <32 x i1> [[TMP16]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <32 x i1> [[TMP21]], [[TMP20]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <32 x i1> [[TMP22]] to <32 x i8>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <32 x i8> [[TMP12]] to <16 x i16>
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp ne <16 x i16> [[TMP23]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <16 x i1> [[TMP24]] to <16 x i16>
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
 ;

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index ac3bb56719038..99eafc13b2bf1 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -1687,16 +1687,30 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
-; CHECK-NEXT:    [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[MMX_VAR_I]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP31:%.*]] = and <4 x i1> [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    [[TMP35:%.*]] = and <4 x i1> [[TMP22]], [[TMP30]]
+; CHECK-NEXT:    [[TMP36:%.*]] = and <4 x i1> [[TMP29]], [[TMP32]]
+; CHECK-NEXT:    [[TMP37:%.*]] = or <4 x i1> [[TMP31]], [[TMP35]]
+; CHECK-NEXT:    [[TMP38:%.*]] = or <4 x i1> [[TMP37]], [[TMP36]]
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <4 x i1> [[TMP38]] to <4 x i16>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x i16> [[TMP23]] to <2 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP27:%.*]] = sext <2 x i1> [[TMP25]] to <2 x i32>
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <2 x i32> [[TMP27]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP28]] to <1 x i64>
+; CHECK-NEXT:    [[TMP33:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[TMP34:%.*]] = bitcast <1 x i64> [[TMP33]] to <2 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP34]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3315,16 +3329,30 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
-; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP22]] to <8 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp ne <8 x i8> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = and <8 x i1> [[TMP32]], [[TMP33]]
+; CHECK-NEXT:    [[TMP38:%.*]] = and <8 x i1> [[TMP35]], [[TMP33]]
+; CHECK-NEXT:    [[TMP39:%.*]] = and <8 x i1> [[TMP32]], [[TMP37]]
+; CHECK-NEXT:    [[TMP40:%.*]] = or <8 x i1> [[TMP34]], [[TMP38]]
+; CHECK-NEXT:    [[TMP41:%.*]] = or <8 x i1> [[TMP40]], [[TMP39]]
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <8 x i1> [[TMP41]] to <8 x i8>
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i8> [[TMP16]] to <4 x i16>
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <4 x i16> [[TMP26]], zeroinitializer
+; CHECK-NEXT:    [[TMP29:%.*]] = sext <4 x i1> [[TMP25]] to <4 x i16>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x i16> [[TMP29]] to i64
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast i64 [[TMP24]] to <1 x i64>
+; CHECK-NEXT:    [[TMP36:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[TMP30]] to <8 x i8>
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <1 x i64> [[TMP36]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP28]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
index 8f915a59db8e5..7048050180792 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
@@ -762,11 +762,20 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <8 x i1> [[TMP12]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <8 x i1> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i1> [[TMP11]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <8 x i1> [[TMP17]] to <8 x i16>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x i16> [[TMP7]] to <4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0]], <8 x i16> [[A1]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
index 5cc56baf0e0de..9960b80f2856b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
@@ -149,11 +149,20 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32>
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i1> [[TMP5]], [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = and <16 x i1> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = and <16 x i1> [[TMP5]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i1> [[TMP13]], [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or <16 x i1> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16>
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <16 x i16> [[TMP8]] to <8 x i32>
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <8 x i32> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i1> [[TMP21]] to <8 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
@@ -714,11 +723,20 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP5]], [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = and <32 x i1> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = and <32 x i1> [[TMP5]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <32 x i1> [[TMP13]], [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or <32 x i1> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sext <32 x i1> [[TMP19]] to <32 x i8>
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <16 x i16> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i1> [[TMP21]] to <16 x i16>
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
 ;
@@ -734,7 +752,7 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK:       4:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
@@ -744,11 +762,20 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT:    [[TMP6:%.*]] = and i64 [[TMP5]], -2147483649
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
-; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i16>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP17]]
+; CHECK-NEXT:    [[TMP20:%.*]] = and <32 x i1> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP21:%.*]] = and <32 x i1> [[TMP10]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <32 x i1> [[TMP12]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = or <32 x i1> [[TMP22]], [[TMP21]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <32 x i1> [[TMP23]] to <32 x i8>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <32 x i8> [[TMP13]] to <16 x i16>
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <16 x i16> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <16 x i1> [[TMP25]] to <16 x i16>
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
 ;

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
index 0a3efaaea149f..74822de4962b2 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
@@ -1730,16 +1730,30 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
-; CHECK-NEXT:    [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[MMX_VAR_I]] to <4 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne <4 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP32:%.*]] = and <4 x i1> [[TMP30]], [[TMP31]]
+; CHECK-NEXT:    [[TMP36:%.*]] = and <4 x i1> [[TMP23]], [[TMP31]]
+; CHECK-NEXT:    [[TMP37:%.*]] = and <4 x i1> [[TMP30]], [[TMP33]]
+; CHECK-NEXT:    [[TMP38:%.*]] = or <4 x i1> [[TMP32]], [[TMP36]]
+; CHECK-NEXT:    [[TMP39:%.*]] = or <4 x i1> [[TMP38]], [[TMP37]]
+; CHECK-NEXT:    [[TMP24:%.*]] = sext <4 x i1> [[TMP39]] to <4 x i16>
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[TMP24]] to <2 x i32>
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp ne <2 x i32> [[TMP25]], zeroinitializer
+; CHECK-NEXT:    [[TMP28:%.*]] = sext <2 x i1> [[TMP26]] to <2 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <2 x i32> [[TMP28]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP29]] to <1 x i64>
+; CHECK-NEXT:    [[TMP34:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[TMP35:%.*]] = bitcast <1 x i64> [[TMP34]] to <2 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP35]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3401,16 +3415,30 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
-; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP22]] to <8 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <8 x i8> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne <8 x i8> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP35:%.*]] = and <8 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP38:%.*]] = and <8 x i1> [[TMP26]], [[TMP34]]
+; CHECK-NEXT:    [[TMP39:%.*]] = and <8 x i1> [[TMP33]], [[TMP36]]
+; CHECK-NEXT:    [[TMP40:%.*]] = or <8 x i1> [[TMP35]], [[TMP38]]
+; CHECK-NEXT:    [[TMP41:%.*]] = or <8 x i1> [[TMP40]], [[TMP39]]
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <8 x i1> [[TMP41]] to <8 x i8>
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <8 x i8> [[TMP16]] to <4 x i16>
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp ne <4 x i16> [[TMP27]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = sext <4 x i1> [[TMP28]] to <4 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i16> [[TMP42]] to i64
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast i64 [[TMP30]] to <1 x i64>
+; CHECK-NEXT:    [[TMP31:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[TMP32]] to <8 x i8>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[TMP31]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
index e771e60e2f294..3a37eafd78ecb 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
@@ -800,11 +800,20 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <8 x i1> [[TMP5]], [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = and <8 x i1> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = and <8 x i1> [[TMP5]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <8 x i1> [[TMP13]], [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or <8 x i1> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i16>
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x i16> [[TMP8]] to <4 x i32>
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i32> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP21]] to <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0]], <8 x i16> [[A1]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index d614bb85d8584..d1060fb33e1bc 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -17,10 +17,19 @@ define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i16> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i16> [[A]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i1> [[TMP2]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and <8 x i1> [[TMP3]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <8 x i1> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i1> [[TMP6]], [[TMP13]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <8 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <8 x i1> [[TMP10]] to <8 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i16> [[TMP11]] to <4 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <4 x i32> [[TMP16]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP17]] to <4 x i32>
 ; CHECK-NEXT:    [[C:%.*]] = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A]], <8 x i16> [[B]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[C]]
@@ -39,13 +48,27 @@ define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_me
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP2:%.*]] = or <1 x i64> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i16> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne <8 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <8 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = and <8 x i1> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <8 x i1> [[TMP17]], [[TMP15]]
+; CHECK-NEXT:    [[TMP12:%.*]] = and <8 x i1> [[TMP14]], [[TMP21]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i1> [[TMP16]], [[TMP11]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or <8 x i1> [[TMP13]], [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <8 x i1> [[TMP22]] to <8 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp ne <4 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <4 x i1> [[TMP24]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i16> [[TMP23]] to i64
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i64 [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[C:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[A]], <1 x i64> [[B]]) #[[ATTR2]]
-; CHECK-NEXT:    store <1 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <1 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <1 x i64> [[C]]
 ;
 entry: