[llvm] [msan] Improve packed multiply-add instrumentation (PR #152941)
Thurston Dang via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 10 23:52:43 PDT 2025
https://github.com/thurstond updated https://github.com/llvm/llvm-project/pull/152941
>From 538cd21f3dd082b9635f501c5ccb8df98f4bdd32 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Sun, 10 Aug 2025 19:55:42 +0000
Subject: [PATCH 01/18] [msan] Improve packed multiply-add instrumentation
The current instrumentation has false positives: if there is a single
uninitialized bit in either operand, the entire output is poisoned.
This does not take into account that multiplying an uninitialized
value by zero results in an initialized zero value.
This patch improves the instrumentation by under-approximating the
multiplication as bitwise AND, with correct handling of ANDing with
zero. This is an under-approximation (no false positives, though it
introduces false negatives) because it ignores the carries from
multiplication.
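For reference, the AND shadow rule (the same one visitAnd() uses) marks a
result bit as poisoned iff it is poisoned in both operands, or poisoned in
one operand while the other operand's value bit is 1. A minimal standalone
model of that rule (illustrative only; not code from this patch):

  #include <cassert>
  #include <cstdint>

  // Shadow rule for c = a & b: the returned shadow is the set of result
  // bits that may depend on an uninitialized input bit.
  uint16_t andShadow(uint16_t Va, uint16_t Sa, uint16_t Vb, uint16_t Sb) {
    return (Sa & Sb) | (Va & Sb) | (Sa & Vb);
  }

  int main() {
    // Bit 0 of a is uninitialized, but b is a known zero: the AND (and,
    // under this approximation, the multiply) yields an initialized zero.
    assert(andShadow(/*Va=*/1, /*Sa=*/1, /*Vb=*/0, /*Sb=*/0) == 0);
    return 0;
  }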
The horizontal add step is modeled precisely.
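Concretely, the shadow of each output lane is the OR of the shadows of the
two input lanes it sums, zero-extended to the output element width. A rough
standalone model of this step for the <8 x i16> -> <4 x i32> pmadd.wd case
(illustrative only; not code from this patch):

  #include <array>
  #include <cstdint>

  // Shadow of the horizontal add out[i] = in[2*i] + in[2*i+1]: an output
  // lane is poisoned iff either of the two input lanes it sums is.
  std::array<uint32_t, 4> pmaddWdShadow(const std::array<uint16_t, 8> &In) {
    std::array<uint32_t, 4> Out{};
    for (int I = 0; I < 4; ++I)
      Out[I] = static_cast<uint32_t>(In[2 * I] | In[2 * I + 1]);
    return Out;
  }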
It also applies the handler to the AVX512 equivalents:
llvm.avx512.pmaddw.d.512, llvm.avx512.pmaddubs.w.512
---
.../Instrumentation/MemorySanitizer.cpp | 73 ++++++++++++--
.../X86/avx2-intrinsics-x86.ll | 45 ++++++---
.../X86/avx512bw-intrinsics-upgrade.ll | 98 +++++++++----------
.../X86/avx512bw-intrinsics.ll | 97 +++++++++---------
.../MemorySanitizer/X86/mmx-intrinsics.ll | 40 +++++---
.../X86/sse2-intrinsics-x86.ll | 15 ++-
.../i386/avx2-intrinsics-i386.ll | 47 ++++++---
.../MemorySanitizer/i386/mmx-intrinsics.ll | 40 +++++---
.../i386/sse2-intrinsics-i386.ll | 15 ++-
.../MemorySanitizer/vector_arith.ll | 30 ++++--
10 files changed, 311 insertions(+), 189 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 6e8138725375a..0bf1bc587c62a 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2865,6 +2865,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
void visitMul(BinaryOperator &I) {
+ // TODO: this can only handle zero bits that are part of statically-known
+ // constants. Consider under-approximating the multiplication as AND
+ // (which ignores the carry), and using the visitAnd() logic.
Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0));
Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1));
if (constOp0 && !constOp1)
@@ -3827,19 +3830,67 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
// Instrument multiply-add intrinsic.
+ //
+ // e.g., <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
+ // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
void handleVectorPmaddIntrinsic(IntrinsicInst &I,
unsigned MMXEltSizeInBits = 0) {
- Type *ResTy =
- MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType();
IRBuilder<> IRB(&I);
- auto *Shadow0 = getShadow(&I, 0);
- auto *Shadow1 = getShadow(&I, 1);
- Value *S = IRB.CreateOr(Shadow0, Shadow1);
- S = IRB.CreateBitCast(S, ResTy);
- S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
- ResTy);
- S = IRB.CreateBitCast(S, getShadowTy(&I));
- setShadow(&I, S);
+
+ Type *ReturnType =
+ MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType();
+ assert(isa<FixedVectorType>(ReturnType));
+
+ assert(I.arg_size() == 2);
+ [[maybe_unused]] FixedVectorType *ParamType =
+ cast<FixedVectorType>(I.getArgOperand(0)->getType());
+ assert(ParamType == I.getArgOperand(1)->getType());
+
+ if (!MMXEltSizeInBits)
+ assert(ParamType->getNumElements() ==
+ 2 * cast<FixedVectorType>(ReturnType)->getNumElements());
+
+ assert(ParamType->getPrimitiveSizeInBits() ==
+ ReturnType->getPrimitiveSizeInBits());
+
+ // Step 1: multiplication of corresponding vector elements
+ // We want to take into account the fact that multiplying zero by an
+ // uninitialized bit results in an initialized value of zero.
+ // We under-approximate multiplication using the same logic as visitAnd().
+ // This ignores the carrying that may happen during multiplication.
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *V1 = I.getOperand(0);
+ Value *V2 = I.getOperand(1);
+
+ Value *S1S2 = IRB.CreateAnd(S1, S2);
+ Value *V1S2 = IRB.CreateAnd(V1, S2);
+ Value *S1V2 = IRB.CreateAnd(S1, V2);
+
+ // e.g., after multiplying <8 x i16> %a and <8 x i16> %b, the element-wise
+ // product %ab is also <8 x i16>.
+ Value *ShadowAB = IRB.CreateOr({S1S2, V1S2, S1V2});
+ // For MMX, %ab has a misleading type, e.g., <1 x i64>.
+ if (MMXEltSizeInBits)
+ ShadowAB = IRB.CreateBitCast(ShadowAB, getMMXVectorTy(MMXEltSizeInBits));
+
+ // Step 2: pairwise/horizontal add
+ // Handle it similarly to handlePairwiseShadowOrIntrinsic().
+ unsigned TotalNumElems =
+ cast<FixedVectorType>(ReturnType)->getNumElements() * 2;
+ SmallVector<int, 8> EvenMask;
+ SmallVector<int, 8> OddMask;
+ for (unsigned X = 0; X < TotalNumElems - 1; X += 2) {
+ EvenMask.push_back(X);
+ OddMask.push_back(X + 1);
+ }
+ Value *EvenShadow = IRB.CreateShuffleVector(ShadowAB, EvenMask);
+ Value *OddShadow = IRB.CreateShuffleVector(ShadowAB, OddMask);
+
+ Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow);
+ OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
+
+ setShadow(&I, OrShadow);
setOriginForNaryOp(I);
}
@@ -5378,6 +5429,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx2_pmadd_wd:
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
case Intrinsic::x86_avx2_pmadd_ub_sw:
+ case Intrinsic::x86_avx512_pmaddw_d_512:
+ case Intrinsic::x86_avx512_pmaddubs_w_512:
handleVectorPmaddIntrinsic(I);
break;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
index f916130fe53e5..3b38fcba35f98 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
@@ -140,11 +140,16 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32>
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
+; CHECK-NEXT: [[TMP3:%.*]] = and <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i16> [[A0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i16> [[TMP1]], [[A1:%.*]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i16> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i16> [[TMP11]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i16> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP6:%.*]] = zext <8 x i16> [[TMP10]] to <8 x i32>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]])
; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[RES]]
;
@@ -677,11 +682,16 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16>
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT: [[TMP3:%.*]] = and <32 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = and <32 x i8> [[A0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i8> [[TMP1]], [[A1:%.*]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <32 x i8> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <32 x i8> [[TMP11]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i8> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i16>
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i16> [[RES]]
;
@@ -706,11 +716,16 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
-; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i16>
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = and <32 x i8> [[_MSLD]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = and <32 x i8> [[A0]], [[TMP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i8> [[_MSLD]], [[A1:%.*]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i8> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i8> [[TMP16]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i8> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP15]] to <16 x i16>
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
; CHECK-NEXT: store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i16> [[RES]]
;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
index 02df9c49a010b..54e9939ace7c3 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
@@ -7,8 +7,6 @@
; - llvm.x86.avx512.dbpsadbw.512
; - llvm.x86.avx512.packssdw.512, llvm.x86.avx512.packsswb.512
; - llvm.x86.avx512.packusdw.512, llvm.x86.avx512.packuswb.512
-; - llvm.x86.avx512.pmaddubs.w.512
-; - llvm.x86.avx512.pmaddw.d.512
;
; Heuristically handled:
; - llvm.sadd.sat.v32i16, llvm.sadd.sat.v64i8
@@ -4930,18 +4928,17 @@ define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <64 x i8> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
-; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = and <64 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = and <64 x i8> [[X0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = and <64 x i8> [[TMP1]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <64 x i8> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i8> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i8> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = zext <32 x i8> [[TMP10]] to <32 x i16>
+; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]])
+; CHECK-NEXT: store <32 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <32 x i16> [[TMP7]]
;
%res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1)
@@ -4955,22 +4952,21 @@ define <32 x i16> @test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <64 x i8> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
+; CHECK-NEXT: [[TMP5:%.*]] = and <64 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <64 x i8> [[X0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = and <64 x i8> [[TMP1]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <64 x i8> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i8> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <64 x i8> [[TMP17]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <64 x i8> [[TMP17]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+; CHECK-NEXT: [[TMP20:%.*]] = or <32 x i8> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = zext <32 x i8> [[TMP20]] to <32 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[X3:%.*]] to <32 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP21]], <32 x i16> [[TMP4]]
; CHECK-NEXT: [[TMP13:%.*]] = xor <32 x i16> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], [[TMP21]]
; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i16> [[TMP14]], [[TMP4]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP15]], <32 x i16> [[TMP12]]
; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[X2]]
@@ -4988,18 +4984,17 @@ define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = and <32 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = and <32 x i16> [[X0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i16> [[TMP1]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <32 x i16> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i16> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i16> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i16> [[TMP10]] to <16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]])
+; CHECK-NEXT: store <16 x i32> [[TMP11]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP7]]
;
%res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1)
@@ -5013,22 +5008,21 @@ define <16 x i32> @test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
+; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <32 x i16> [[X0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = and <32 x i16> [[TMP1]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i16> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <32 x i16> [[TMP17]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <32 x i16> [[TMP17]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i16> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i16> [[TMP20]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP21]], <16 x i32> [[TMP4]]
; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP21]]
; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
index 78c272c7b2c5a..5ec8d054de381 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
@@ -9,7 +9,6 @@
; - llvm.x86.avx512.mask.pmov.wb.mem.512
; - llvm.x86.avx512.packssdw.512, llvm.x86.avx512.packsswb.512
; - llvm.x86.avx512.packusdw.512, llvm.x86.avx512.packuswb.512
-; - llvm.x86.avx512.pmaddubs.w.512, llvm.x86.avx512.pmaddw.d.512
; - llvm.x86.avx512.psad.bw.512
;
; Heuristically handled:
@@ -2204,18 +2203,17 @@ define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <64 x i8> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
-; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = and <64 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = and <64 x i8> [[X0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = and <64 x i8> [[TMP1]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <64 x i8> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i8> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i8> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = zext <32 x i8> [[TMP10]] to <32 x i16>
+; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]])
+; CHECK-NEXT: store <32 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <32 x i16> [[TMP7]]
;
%1 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1)
@@ -2229,22 +2227,21 @@ define <32 x i16> @test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <64 x i8> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
+; CHECK-NEXT: [[TMP5:%.*]] = and <64 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <64 x i8> [[X0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = and <64 x i8> [[TMP1]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <64 x i8> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i8> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <64 x i8> [[TMP17]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <64 x i8> [[TMP17]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+; CHECK-NEXT: [[TMP20:%.*]] = or <32 x i8> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = zext <32 x i8> [[TMP20]] to <32 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[X3:%.*]] to <32 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP21]], <32 x i16> [[TMP4]]
; CHECK-NEXT: [[TMP13:%.*]] = xor <32 x i16> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], [[TMP21]]
; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i16> [[TMP14]], [[TMP4]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP15]], <32 x i16> [[TMP12]]
; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[X2]]
@@ -2264,18 +2261,17 @@ define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = and <32 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = and <32 x i16> [[X0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i16> [[TMP1]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <32 x i16> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i16> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i16> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i16> [[TMP10]] to <16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]])
+; CHECK-NEXT: store <16 x i32> [[TMP11]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP7]]
;
%1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1)
@@ -2289,22 +2285,21 @@ define <16 x i32> @test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
+; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <32 x i16> [[X0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = and <32 x i16> [[TMP1]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i16> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <32 x i16> [[TMP17]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <32 x i16> [[TMP17]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i16> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i16> [[TMP20]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP21]], <16 x i32> [[TMP4]]
; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP21]]
; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index ac3bb56719038..9717471564bc7 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -1687,11 +1687,18 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = and <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = and <1 x i64> [[MMX_VAR_I]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <1 x i64> [[TMP6]], [[MMX_VAR1_I]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP22:%.*]] = or <1 x i64> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <1 x i64> [[TMP22]] to <4 x i16>
+; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP26:%.*]] = or <2 x i16> [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = bitcast <2 x i16> [[TMP26]] to i32
+; CHECK-NEXT: [[TMP28:%.*]] = zext i32 [[TMP27]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[TMP28]] to <1 x i64>
; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
@@ -3315,16 +3322,23 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
-; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <1 x i64> [[TMP21]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <1 x i64> [[TMP22]], [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = and <1 x i64> [[TMP21]], [[TMP23]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <1 x i64> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <1 x i64> [[TMP13]], [[TMP12]]
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
+; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP27:%.*]] = or <4 x i8> [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x i8> [[TMP27]] to i32
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast i64 [[TMP29]] to <1 x i64>
+; CHECK-NEXT: [[TMP30:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP31:%.*]] = bitcast <1 x i64> [[TMP30]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
index 8f915a59db8e5..afd8c7a58b6ee 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
@@ -762,11 +762,16 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
-; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
+; CHECK-NEXT: [[TMP3:%.*]] = and <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i16> [[A0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i16> [[TMP1]], [[A1:%.*]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i16> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i16> [[TMP11]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i16> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0]], <8 x i16> [[A1]])
; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[RES]]
;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
index 5cc56baf0e0de..f92f7ef3fa4a9 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
@@ -149,11 +149,16 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32>
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i16> [[A0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i16> [[TMP1]], [[A1:%.*]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i16> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i16> [[TMP13]], [[TMP12]]
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i16> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP6:%.*]] = zext <8 x i16> [[TMP11]] to <8 x i32>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]])
; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[RES]]
;
@@ -714,11 +719,16 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16>
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = and <32 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i8> [[A0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i8> [[TMP1]], [[A1:%.*]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i8> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i8> [[TMP13]], [[TMP12]]
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i8> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[TMP11]] to <16 x i16>
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i16> [[RES]]
;
@@ -734,7 +744,7 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
; CHECK: 4:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
@@ -744,11 +754,16 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[TMP5]], -2147483649
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
-; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i16>
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = and <32 x i8> [[_MSLD]], [[TMP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i8> [[A0]], [[TMP2]]
+; CHECK-NEXT: [[TMP17:%.*]] = and <32 x i8> [[_MSLD]], [[A1:%.*]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i8> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i8> [[TMP12]], [[TMP17]]
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i8> [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i16>
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
; CHECK-NEXT: store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i16> [[RES]]
;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
index 0a3efaaea149f..dc9173a8b2a18 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
@@ -1730,11 +1730,18 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = and <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <1 x i64> [[MMX_VAR_I]], [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <1 x i64> [[TMP6]], [[MMX_VAR1_I]]
+; CHECK-NEXT: [[TMP22:%.*]] = or <1 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP23:%.*]] = or <1 x i64> [[TMP22]], [[TMP11]]
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16>
+; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP27:%.*]] = or <2 x i16> [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = bitcast <2 x i16> [[TMP27]] to i32
+; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[TMP29]] to <1 x i64>
; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
@@ -3401,16 +3408,23 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
-; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <1 x i64> [[TMP21]], [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = and <1 x i64> [[TMP22]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = and <1 x i64> [[TMP21]], [[TMP23]]
+; CHECK-NEXT: [[TMP26:%.*]] = or <1 x i64> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <1 x i64> [[TMP26]], [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
+; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP29:%.*]] = or <4 x i8> [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i8> [[TMP29]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = zext i32 [[TMP30]] to i64
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast i64 [[TMP31]] to <1 x i64>
+; CHECK-NEXT: [[TMP32:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast <1 x i64> [[TMP32]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
index e771e60e2f294..d4c2d8bb6749f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
@@ -800,11 +800,16 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
-; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i16> [[A0:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i16> [[TMP1]], [[A1:%.*]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i16> [[TMP13]], [[TMP12]]
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i16> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32>
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0]], <8 x i16> [[A1]])
; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[RES]]
;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index d614bb85d8584..e9b5462bc85cb 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -17,10 +17,15 @@ define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i16> [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = and <8 x i16> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = and <8 x i16> [[A]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i16> [[TMP0]], [[B]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i16> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i16> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i16> [[TMP9]] to <4 x i32>
; CHECK-NEXT: [[C:%.*]] = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A]], <8 x i16> [[B]]) #[[ATTR2:[0-9]+]]
; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[C]]
@@ -39,11 +44,18 @@ define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_me
; CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = or <1 x i64> [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <4 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i16> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = and <1 x i64> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = and <1 x i64> [[A]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = and <1 x i64> [[TMP0]], [[B]]
+; CHECK-NEXT: [[TMP5:%.*]] = or <1 x i64> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <1 x i64> [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP13]] to <8 x i8>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i8> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i8> [[TMP10]] to i32
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP12]] to <1 x i64>
; CHECK-NEXT: [[C:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[A]], <1 x i64> [[B]]) #[[ATTR2]]
; CHECK-NEXT: store <1 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <1 x i64> [[C]]
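For readers tracing the CHECK lines above: a minimal scalar model of the
shadow this patch computes for one 32-bit lane of pmadd.wd (illustrative
C++ of mine, not part of the patch; the names are made up). Each 16-bit
product shadow uses the visitAnd()-style rule, then adjacent shadows are
OR'ed and zero-extended, mirroring the shufflevector/or/zext sequence.

  #include <cstdint>

  // visitAnd()-style shadow for a 16-bit multiply approximated as AND:
  // a bit is poisoned if it is poisoned in both operands, or poisoned in
  // one operand where the other operand's value bit is 1.
  uint16_t mulShadowAsAnd(uint16_t Va, uint16_t Sa, uint16_t Vb,
                          uint16_t Sb) {
    return (Sa & Sb) | (Va & Sb) | (Sa & Vb);
  }

  // Horizontal-add step: OR the shadows of the two adjacent products
  // (the even/odd shuffles) and zero-extend into the 32-bit lane.
  uint32_t pmaddWdLaneShadow(uint16_t Va0, uint16_t Sa0, uint16_t Vb0,
                             uint16_t Sb0, uint16_t Va1, uint16_t Sa1,
                             uint16_t Vb1, uint16_t Sb1) {
    return static_cast<uint32_t>(mulShadowAsAnd(Va0, Sa0, Vb0, Sb0) |
                                 mulShadowAsAnd(Va1, Sa1, Vb1, Sb1));
  }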
>From 2ccc07f7781f74e069b6c1ab76f0af2f657adadd Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Sun, 10 Aug 2025 21:00:24 +0000
Subject: [PATCH 02/18] Update comment
---
llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 0bf1bc587c62a..f938b0acaa0be 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3856,8 +3856,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Step 1: multiplication of corresponding vector elements
// We want to take into account the fact that multiplying zero by an
// uninitialized bit results in an initialized value of zero.
- // We under-approximate multiplication using the same logic as visitAnd().
- // This ignores the carrying that may happen during multiplication.
+ // We under-approximate multiplication by treating it as bitwise AND; this
+ // has no false positives but substantial false negatives. We then
+ // compute the shadow using the same logic as visitAnd().
Value *S1 = getShadow(&I, 0);
Value *S2 = getShadow(&I, 1);
Value *V1 = I.getOperand(0);
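To make the reworded comment concrete, a hedged example (values and code
mine, not from the patch) of the trade-off it describes:

  #include <cstdint>

  // Shadow of a*b under the bitwise-AND approximation (visitAnd() rule).
  uint16_t andModelShadow(uint16_t Va, uint16_t Sa, uint16_t Vb,
                          uint16_t Sb) {
    return (Sa & Sb) | (Va & Sb) | (Sa & Vb);
  }

  // With Va = 3 fully initialized (Sa = 0) and an uninitialized low bit
  // in b (Sb = 1), the model flags only bit 0:
  //   andModelShadow(3, 0, Vb, 1) == 1
  // A precise model of 3*b = (b << 1) + b would also flag bits 1 and 2,
  // which the carry can reach -- a false negative. Per the patch
  // description, the approximation never flags a provably initialized
  // bit, so there are no false positives.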
>From d952804fd9353a0924af7ae3bf380536cdaee4d7 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 00:20:56 +0000
Subject: [PATCH 03/18] Only allow elements that are exactly zero to clear
shadow
---
.../Instrumentation/MemorySanitizer.cpp | 33 +++++++-------
.../X86/avx2-intrinsics-x86.ll | 33 +++++++-------
.../X86/avx512bw-intrinsics-upgrade.ll | 44 ++++++++++---------
.../X86/avx512bw-intrinsics.ll | 44 ++++++++++---------
.../MemorySanitizer/X86/mmx-intrinsics.ll | 22 +++++-----
.../X86/sse2-intrinsics-x86.ll | 11 ++---
.../i386/avx2-intrinsics-i386.ll | 33 +++++++-------
.../MemorySanitizer/i386/mmx-intrinsics.ll | 22 +++++-----
.../i386/sse2-intrinsics-i386.ll | 11 ++---
.../MemorySanitizer/vector_arith.ll | 22 +++++-----
10 files changed, 148 insertions(+), 127 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index f938b0acaa0be..340301867bdee 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2865,9 +2865,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
void visitMul(BinaryOperator &I) {
- // TODO: this can only handle zero bits that are part of statically-known
- // constants. Consider under-approximating the multiplication as AND
- // (which ignores the carry), and using the visitAnd() logic.
Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0));
Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1));
if (constOp0 && !constOp1)
@@ -3854,28 +3851,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ReturnType->getPrimitiveSizeInBits());
// Step 1: multiplication of corresponding vector elements
- // We want to take into account the fact that multiplying zero by an
- // uninitialized bit results in an initialized value of zero.
- // We under-approximate multiplication by treating it as bitwise AND; this
- // has no false positives but substantial false negatives. We then
- // compute the shadow using the same logic as visitAnd().
Value *S1 = getShadow(&I, 0);
Value *S2 = getShadow(&I, 1);
Value *V1 = I.getOperand(0);
Value *V2 = I.getOperand(1);
- Value *S1S2 = IRB.CreateAnd(S1, S2);
- Value *V1S2 = IRB.CreateAnd(V1, S2);
- Value *S1V2 = IRB.CreateAnd(S1, V2);
+ Value *S1S2 = IRB.CreateOr(S1, S2);
+
+ // We allow the special case where multiplying an uninitialized
+ // element by zero results in an initialized element.
+ Value *Zero = Constant::getNullValue(V1->getType());
+ Value *V1NotZero = IRB.CreateICmpNE(V1, Zero);
+ Value *V2NotZero = IRB.CreateICmpNE(V2, Zero);
+ Value *V1AndV2NotZero = IRB.CreateAnd(V1NotZero, V2NotZero);
+
+ // After multiplying e.g., <8 x i16> %a, <8 x i16> %b, we should have
+ // <8 x i32> %ab, but we cheated and ended up with <8 x i16>.
+ S1S2 = IRB.CreateAnd(S1S2, IRB.CreateSExt(V1AndV2NotZero, S1S2->getType()));
- // After multiplying e.g., <8 x i16> %a, <8 x i16> %b, we have
- // <8 x i16> %ab.
- Value *ShadowAB = IRB.CreateOr({S1S2, V1S2, S1V2});
// For MMX, %ab has a misleading type e.g., <1 x i64>.
if (MMXEltSizeInBits)
- ShadowAB = IRB.CreateBitCast(ShadowAB, getMMXVectorTy(MMXEltSizeInBits));
+ S1S2 = IRB.CreateBitCast(S1S2, getMMXVectorTy(MMXEltSizeInBits));
// Step 2: pairwise/horizontal add
+ // Collapse <8 x i16> into <4 x i32>
// Handle it similarly to handlePairwiseShadowOrIntrinsic().
unsigned TotalNumElems =
cast<FixedVectorType>(ReturnType)->getNumElements() * 2;
@@ -3885,8 +3884,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
EvenMask.push_back(X);
OddMask.push_back(X + 1);
}
- Value *EvenShadow = IRB.CreateShuffleVector(ShadowAB, EvenMask);
- Value *OddShadow = IRB.CreateShuffleVector(ShadowAB, OddMask);
+ Value *EvenShadow = IRB.CreateShuffleVector(S1S2, EvenMask);
+ Value *OddShadow = IRB.CreateShuffleVector(S1S2, OddMask);
Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow);
OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
index 3b38fcba35f98..ab05e10ccd3c9 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
@@ -140,11 +140,12 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i16> [[A0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i16> [[TMP1]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i16> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i16> [[TMP11]], [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i16>
+; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i16> [[TMP3]], [[TMP12]]
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i16> [[TMP8]], [[TMP9]]
@@ -682,11 +683,12 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = and <32 x i8> [[A0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i8> [[TMP1]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <32 x i8> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or <32 x i8> [[TMP11]], [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i8>
+; CHECK-NEXT: [[TMP7:%.*]] = and <32 x i8> [[TMP3]], [[TMP12]]
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i8> [[TMP8]], [[TMP9]]
@@ -716,11 +718,12 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
-; CHECK-NEXT: [[TMP8:%.*]] = and <32 x i8> [[_MSLD]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = and <32 x i8> [[A0]], [[TMP2]]
-; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i8> [[_MSLD]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i8> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i8> [[TMP16]], [[TMP10]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i8> [[TMP8]], [[TMP17]]
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i8> [[TMP13]], [[TMP14]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
index 54e9939ace7c3..3a6d150a1ff27 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
@@ -4928,11 +4928,12 @@ define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and <64 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = and <64 x i8> [[X0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <64 x i8> [[TMP1]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = or <64 x i8> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i8> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = and <64 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = sext <64 x i1> [[TMP6]] to <64 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = and <64 x i8> [[TMP3]], [[TMP13]]
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i8> [[TMP8]], [[TMP9]]
@@ -4952,11 +4953,12 @@ define <32 x i16> @test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = and <64 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = and <64 x i8> [[X0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = and <64 x i8> [[TMP1]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <64 x i8> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i8> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP5:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = and <64 x i1> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP22:%.*]] = sext <64 x i1> [[TMP8]] to <64 x i8>
+; CHECK-NEXT: [[TMP17:%.*]] = and <64 x i8> [[TMP5]], [[TMP22]]
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <64 x i8> [[TMP17]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <64 x i8> [[TMP17]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
; CHECK-NEXT: [[TMP20:%.*]] = or <32 x i8> [[TMP18]], [[TMP19]]
@@ -4984,11 +4986,12 @@ define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and <32 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = and <32 x i16> [[X0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i16> [[TMP1]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = or <32 x i16> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i16> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = and <32 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = sext <32 x i1> [[TMP6]] to <32 x i16>
+; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i16> [[TMP3]], [[TMP13]]
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i16> [[TMP8]], [[TMP9]]
@@ -5008,11 +5011,12 @@ define <16 x i32> @test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = and <32 x i16> [[X0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = and <32 x i16> [[TMP1]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i16> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP5:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = and <32 x i1> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP22:%.*]] = sext <32 x i1> [[TMP8]] to <32 x i16>
+; CHECK-NEXT: [[TMP17:%.*]] = and <32 x i16> [[TMP5]], [[TMP22]]
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <32 x i16> [[TMP17]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <32 x i16> [[TMP17]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i16> [[TMP18]], [[TMP19]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
index 5ec8d054de381..1b9b9aa23b8c7 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
@@ -2203,11 +2203,12 @@ define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and <64 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = and <64 x i8> [[X0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <64 x i8> [[TMP1]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = or <64 x i8> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i8> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = and <64 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = sext <64 x i1> [[TMP6]] to <64 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = and <64 x i8> [[TMP3]], [[TMP13]]
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i8> [[TMP8]], [[TMP9]]
@@ -2227,11 +2228,12 @@ define <32 x i16> @test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = and <64 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = and <64 x i8> [[X0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = and <64 x i8> [[TMP1]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <64 x i8> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i8> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP5:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = and <64 x i1> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP22:%.*]] = sext <64 x i1> [[TMP8]] to <64 x i8>
+; CHECK-NEXT: [[TMP17:%.*]] = and <64 x i8> [[TMP5]], [[TMP22]]
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <64 x i8> [[TMP17]], <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <64 x i8> [[TMP17]], <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
; CHECK-NEXT: [[TMP20:%.*]] = or <32 x i8> [[TMP18]], [[TMP19]]
@@ -2261,11 +2263,12 @@ define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and <32 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = and <32 x i16> [[X0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i16> [[TMP1]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = or <32 x i16> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i16> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = and <32 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = sext <32 x i1> [[TMP6]] to <32 x i16>
+; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i16> [[TMP3]], [[TMP13]]
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <32 x i16> [[TMP12]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i16> [[TMP8]], [[TMP9]]
@@ -2285,11 +2288,12 @@ define <16 x i32> @test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = and <32 x i16> [[X0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = and <32 x i16> [[TMP1]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i16> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP5:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = and <32 x i1> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP22:%.*]] = sext <32 x i1> [[TMP8]] to <32 x i16>
+; CHECK-NEXT: [[TMP17:%.*]] = and <32 x i16> [[TMP5]], [[TMP22]]
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <32 x i16> [[TMP17]], <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <32 x i16> [[TMP17]], <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i16> [[TMP18]], [[TMP19]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index 9717471564bc7..36b74bfb24255 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -1687,11 +1687,12 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = and <1 x i64> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = and <1 x i64> [[MMX_VAR_I]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = and <1 x i64> [[TMP6]], [[MMX_VAR1_I]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP22:%.*]] = or <1 x i64> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <1 x i64> [[MMX_VAR_I]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <1 x i64> [[MMX_VAR1_I]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = and <1 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP29:%.*]] = sext <1 x i1> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[TMP22:%.*]] = and <1 x i64> [[TMP8]], [[TMP29]]
; CHECK-NEXT: [[TMP23:%.*]] = bitcast <1 x i64> [[TMP22]] to <4 x i16>
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
@@ -3322,11 +3323,12 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = and <1 x i64> [[TMP21]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = and <1 x i64> [[TMP22]], [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = and <1 x i64> [[TMP21]], [[TMP23]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <1 x i64> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <1 x i64> [[TMP13]], [[TMP12]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <1 x i64> [[TMP22]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <1 x i64> [[TMP23]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = and <1 x i1> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP32:%.*]] = sext <1 x i1> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = and <1 x i64> [[TMP10]], [[TMP32]]
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
index afd8c7a58b6ee..fe1245553c116 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
@@ -762,11 +762,12 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and <8 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i16> [[A0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i16> [[TMP1]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i16> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i16> [[TMP11]], [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = sext <8 x i1> [[TMP11]] to <8 x i16>
+; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i16> [[TMP3]], [[TMP12]]
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i16> [[TMP8]], [[TMP9]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
index f92f7ef3fa4a9..bf87027b056fa 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
@@ -149,11 +149,12 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i16> [[A0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i16> [[TMP1]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i16> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i16> [[TMP13]], [[TMP12]]
+; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP5]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16>
+; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i16> [[TMP4]], [[TMP14]]
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i16> [[TMP9]], [[TMP10]]
@@ -719,11 +720,12 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = and <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i8> [[A0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i8> [[TMP1]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i8> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i8> [[TMP13]], [[TMP12]]
+; CHECK-NEXT: [[TMP4:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP5]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i8>
+; CHECK-NEXT: [[TMP8:%.*]] = and <32 x i8> [[TMP4]], [[TMP14]]
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i8> [[TMP9]], [[TMP10]]
@@ -754,11 +756,12 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[TMP5]], -2147483649
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
-; CHECK-NEXT: [[TMP9:%.*]] = and <32 x i8> [[_MSLD]], [[TMP2]]
-; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i8> [[A0]], [[TMP2]]
-; CHECK-NEXT: [[TMP17:%.*]] = and <32 x i8> [[_MSLD]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i8> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i8> [[TMP12]], [[TMP17]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP12]] to <32 x i8>
+; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i8> [[TMP9]], [[TMP18]]
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i8> [[TMP14]], [[TMP15]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
index dc9173a8b2a18..ebec17f3503fb 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
@@ -1730,11 +1730,12 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = and <1 x i64> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = and <1 x i64> [[MMX_VAR_I]], [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = and <1 x i64> [[TMP6]], [[MMX_VAR1_I]]
-; CHECK-NEXT: [[TMP22:%.*]] = or <1 x i64> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP23:%.*]] = or <1 x i64> [[TMP22]], [[TMP11]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <1 x i64> [[MMX_VAR_I]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <1 x i64> [[MMX_VAR1_I]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = and <1 x i1> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP30:%.*]] = sext <1 x i1> [[TMP22]] to <1 x i64>
+; CHECK-NEXT: [[TMP23:%.*]] = and <1 x i64> [[TMP9]], [[TMP30]]
; CHECK-NEXT: [[TMP24:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16>
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
@@ -3408,11 +3409,12 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = and <1 x i64> [[TMP21]], [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = and <1 x i64> [[TMP22]], [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = and <1 x i64> [[TMP21]], [[TMP23]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <1 x i64> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <1 x i64> [[TMP26]], [[TMP13]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <1 x i64> [[TMP22]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <1 x i64> [[TMP23]], zeroinitializer
+; CHECK-NEXT: [[TMP26:%.*]] = and <1 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP33:%.*]] = sext <1 x i1> [[TMP26]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = and <1 x i64> [[TMP11]], [[TMP33]]
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
index d4c2d8bb6749f..5edc13b7abbc4 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
@@ -800,11 +800,12 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i16> [[A0:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i16> [[TMP1]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i16> [[TMP13]], [[TMP12]]
+; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[TMP5]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16>
+; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i16> [[TMP4]], [[TMP14]]
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i16> [[TMP9]], [[TMP10]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index e9b5462bc85cb..a0a293ac5351f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -17,11 +17,12 @@ define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = and <8 x i16> [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = and <8 x i16> [[A]], [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i16> [[TMP0]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i16> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i16> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i16> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i16> [[A]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = sext <8 x i1> [[TMP10]] to <8 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i16> [[TMP2]], [[TMP11]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i16> [[TMP7]], [[TMP8]]
@@ -44,11 +45,12 @@ define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_me
; CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = and <1 x i64> [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = and <1 x i64> [[A]], [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = and <1 x i64> [[TMP0]], [[B]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <1 x i64> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <1 x i64> [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP2:%.*]] = or <1 x i64> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <1 x i64> [[A]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <1 x i64> [[B]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = and <1 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = and <1 x i64> [[TMP2]], [[TMP14]]
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP13]] to <8 x i8>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
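A minimal scalar model (illustrative C++ of mine, not from the patch) of
the stricter rule checked above: the element shadow is the OR of both
operand shadows, cleared only when an operand element is exactly zero.

  #include <cstdint>

  // Per-element shadow for the multiply step: OR the operand shadows,
  // then clear the whole element if either operand element is exactly
  // zero (the icmp ne / and / sext / and sequence in the CHECK lines).
  uint16_t mulShadowElt(uint16_t Va, uint16_t Sa, uint16_t Vb,
                        uint16_t Sb) {
    uint16_t Pessimistic = Sa | Sb;
    uint16_t Mask = (Va != 0 && Vb != 0) ? 0xFFFF : 0; // sext of the i1
    return Pessimistic & Mask;
  }

  // The horizontal-add step is unchanged: shadows of adjacent products
  // are OR'ed and widened into the accumulator lane.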
>From f2d3972c5efa0f2210ec78d908881b8ed5923974 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 01:28:40 +0000
Subject: [PATCH 04/18] Fix MMX instrumentation
---
.../Instrumentation/MemorySanitizer.cpp | 33 ++++++++++--------
.../MemorySanitizer/X86/mmx-intrinsics.ll | 34 +++++++++++--------
.../MemorySanitizer/i386/mmx-intrinsics.ll | 34 +++++++++++--------
.../MemorySanitizer/vector_arith.ll | 17 ++++++----
4 files changed, 69 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 340301867bdee..2cd69a5d72672 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3834,18 +3834,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
unsigned MMXEltSizeInBits = 0) {
IRBuilder<> IRB(&I);
- Type *ReturnType =
- MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType();
+ FixedVectorType *ReturnType = cast<FixedVectorType>(I.getType());
assert(isa<FixedVectorType>(ReturnType));
assert(I.arg_size() == 2);
- [[maybe_unused]] FixedVectorType *ParamType =
+ FixedVectorType *ParamType =
cast<FixedVectorType>(I.getArgOperand(0)->getType());
assert(ParamType == I.getArgOperand(1)->getType());
- if (!MMXEltSizeInBits)
- assert(ParamType->getNumElements() ==
- 2 * cast<FixedVectorType>(ReturnType)->getNumElements());
+ Value *V1 = I.getOperand(0);
+ Value *V2 = I.getOperand(1);
assert(ParamType->getPrimitiveSizeInBits() ==
ReturnType->getPrimitiveSizeInBits());
@@ -3853,8 +3851,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Step 1: multiplication of corresponding vector elements
Value *S1 = getShadow(&I, 0);
Value *S2 = getShadow(&I, 1);
- Value *V1 = I.getOperand(0);
- Value *V2 = I.getOperand(1);
+
+ if (MMXEltSizeInBits) {
+ ReturnType = cast<FixedVectorType>(getMMXVectorTy(MMXEltSizeInBits * 2));
+ ParamType = cast<FixedVectorType>(getMMXVectorTy(MMXEltSizeInBits));
+
+ V1 = IRB.CreateBitCast(V1, ParamType);
+ V2 = IRB.CreateBitCast(V2, ParamType);
+
+ S1 = IRB.CreateBitCast(S1, getShadowTy(ParamType));
+ S2 = IRB.CreateBitCast(S2, getShadowTy(ParamType));
+ }
+
+ assert(ParamType->getNumElements() ==
+ 2 * ReturnType->getNumElements());
Value *S1S2 = IRB.CreateOr(S1, S2);
@@ -3869,15 +3879,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// <8 x i32> %ab, but we cheated and ended up with <8 x i16>.
S1S2 = IRB.CreateAnd(S1S2, IRB.CreateSExt(V1AndV2NotZero, S1S2->getType()));
- // For MMX, %ab has a misleading type e.g., <1 x i64>.
- if (MMXEltSizeInBits)
- S1S2 = IRB.CreateBitCast(S1S2, getMMXVectorTy(MMXEltSizeInBits));
-
// Step 2: pairwise/horizontal add
// Collapse <8 x i16> into <4 x i32>
// Handle it similarly to handlePairwiseShadowOrIntrinsic().
- unsigned TotalNumElems =
- cast<FixedVectorType>(ReturnType)->getNumElements() * 2;
+ unsigned TotalNumElems = ReturnType->getNumElements() * 2;
SmallVector<int, 8> EvenMask;
SmallVector<int, 8> OddMask;
for (unsigned X = 0; X < TotalNumElems - 1; X += 2) {
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index 36b74bfb24255..bdbd02338040b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -1687,13 +1687,16 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <1 x i64> [[MMX_VAR_I]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <1 x i64> [[MMX_VAR1_I]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = and <1 x i1> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP29:%.*]] = sext <1 x i1> [[TMP11]] to <1 x i64>
-; CHECK-NEXT: [[TMP22:%.*]] = and <1 x i64> [[TMP8]], [[TMP29]]
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <1 x i64> [[TMP22]] to <4 x i16>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[MMX_VAR_I]] to <4 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = or <4 x i16> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = sext <4 x i1> [[TMP31]] to <4 x i16>
+; CHECK-NEXT: [[TMP23:%.*]] = and <4 x i16> [[TMP22]], [[TMP32]]
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[TMP26:%.*]] = or <2 x i16> [[TMP24]], [[TMP25]]
@@ -3323,13 +3326,16 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <1 x i64> [[TMP22]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <1 x i64> [[TMP23]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = and <1 x i1> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP32:%.*]] = sext <1 x i1> [[TMP13]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = and <1 x i64> [[TMP10]], [[TMP32]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP22]] to <8 x i8>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP34:%.*]] = and <8 x i1> [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = sext <8 x i1> [[TMP34]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i8> [[TMP14]], [[TMP35]]
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP27:%.*]] = or <4 x i8> [[TMP25]], [[TMP26]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
index ebec17f3503fb..486b3bf1e411d 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
@@ -1730,13 +1730,16 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <1 x i64> [[MMX_VAR_I]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <1 x i64> [[MMX_VAR1_I]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = and <1 x i1> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP30:%.*]] = sext <1 x i1> [[TMP22]] to <1 x i64>
-; CHECK-NEXT: [[TMP23:%.*]] = and <1 x i64> [[TMP9]], [[TMP30]]
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[MMX_VAR_I]] to <4 x i16>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT: [[TMP23:%.*]] = or <4 x i16> [[TMP11]], [[TMP22]]
+; CHECK-NEXT: [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP33:%.*]] = sext <4 x i1> [[TMP32]] to <4 x i16>
+; CHECK-NEXT: [[TMP24:%.*]] = and <4 x i16> [[TMP23]], [[TMP33]]
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i16> [[TMP24]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[TMP27:%.*]] = or <2 x i16> [[TMP25]], [[TMP26]]
@@ -3409,13 +3412,16 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <1 x i64> [[TMP22]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <1 x i64> [[TMP23]], zeroinitializer
-; CHECK-NEXT: [[TMP26:%.*]] = and <1 x i1> [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP33:%.*]] = sext <1 x i1> [[TMP26]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = and <1 x i64> [[TMP11]], [[TMP33]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP22]] to <8 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8>
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i8> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i8> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP33]], [[TMP34]]
+; CHECK-NEXT: [[TMP36:%.*]] = sext <8 x i1> [[TMP35]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i8> [[TMP26]], [[TMP36]]
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP29:%.*]] = or <4 x i8> [[TMP27]], [[TMP28]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index a0a293ac5351f..3a161f5bb3ac4 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -45,13 +45,16 @@ define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_me
; CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = or <1 x i64> [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <1 x i64> [[A]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <1 x i64> [[B]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = and <1 x i1> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = and <1 x i64> [[TMP2]], [[TMP14]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP13]] to <8 x i8>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8>
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i8> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i1> [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i8>
+; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i8> [[TMP13]], [[TMP17]]
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i8> [[TMP8]], [[TMP9]]
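Why the bitcasts above matter: the zero test has to run per lane, not per
64-bit MMX register. A hedged illustration (values and helper mine):

  #include <cstdint>

  // Lane test for the <4 x i16> view of an MMX register.
  bool laneIsZero(uint64_t Reg, unsigned Lane) {
    return ((Reg >> (16 * Lane)) & 0xFFFF) == 0;
  }

  // For Reg = 0x0000000000000003, the whole-register test (Reg != 0)
  // that the pre-fix <1 x i64> icmp performed would never clear any
  // shadow, even though lanes 1..3 are zero. After the bitcast to
  // <4 x i16>, laneIsZero(Reg, Lane) holds for lanes 1..3, so products
  // against those zero lanes can have their shadow cleared, as intended.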
>From 13b3d88a1f8aa9c3b50b963153ad945f40ee0def Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 01:53:12 +0000
Subject: [PATCH 05/18] Add comment on types
---
.../Instrumentation/MemorySanitizer.cpp | 20 +++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 2cd69a5d72672..3a566b7505303 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3868,8 +3868,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *S1S2 = IRB.CreateOr(S1, S2);
- // We allow the special case of multiplying where multiplying an uninitialized
- // element by zero results in an initialized element.
+    // Multiplying an uninitialized element by zero results in an initialized
+    // element.
Value *Zero = Constant::getNullValue(V1->getType());
Value *V1NotZero = IRB.CreateICmpNE(V1, Zero);
Value *V2NotZero = IRB.CreateICmpNE(V2, Zero);
@@ -5430,6 +5430,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleVectorSadIntrinsic(I);
break;
+ // @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
+ // @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
+ // @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
+ // @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
+ // @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)
+ // @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
+ //
+ // These intrinsics are auto-upgraded into non-masked forms:
+ // @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8)
+ // @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16)
+ // @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32)
+ // @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8)
+ // @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8)
+ // @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16)
case Intrinsic::x86_sse2_pmadd_wd:
case Intrinsic::x86_avx2_pmadd_wd:
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
@@ -5439,10 +5453,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleVectorPmaddIntrinsic(I);
break;
+ // @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
case Intrinsic::x86_ssse3_pmadd_ub_sw:
handleVectorPmaddIntrinsic(I, 8);
break;
+ // @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
case Intrinsic::x86_mmx_pmadd_wd:
handleVectorPmaddIntrinsic(I, 16);
break;
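As a reading aid for the comment above, here is a scalar model of the zero-aware multiply shadow, with illustrative names (a sketch of the semantics, not the MSan API):

  #include <cstdint>
  #include <cstdio>

  // The product's shadow is the OR of the operand shadows, cleared when
  // either concrete operand is zero. Ignoring the multiplication carry
  // makes this an underapproximation, so it cannot add false positives.
  uint16_t mulShadow(uint16_t V1, uint16_t S1, uint16_t V2, uint16_t S2) {
    uint16_t S1S2 = S1 | S2;
    return (V1 != 0 && V2 != 0) ? S1S2 : 0;
  }

  int main() {
    // x * 0 is fully initialized even if x carries poisoned bits.
    printf("%#x\n", mulShadow(/*V1=*/0xff, /*S1=*/0x0f, /*V2=*/0, /*S2=*/0));
    // nonzero * nonzero keeps the OR of the shadows.
    printf("%#x\n", mulShadow(0xff, 0x0f, 0x3, 0x0));
    return 0;
  }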
>From 7c6bafb9b357013946e52233c5d9d849a8deb3eb Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 01:56:39 +0000
Subject: [PATCH 06/18] Add return types
---
.../Instrumentation/MemorySanitizer.cpp | 28 +++++++++----------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 3a566b7505303..ae0da50fa421f 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -5430,20 +5430,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleVectorSadIntrinsic(I);
break;
- // @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
- // @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
- // @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
- // @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
- // @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)
- // @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
+ // <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
+ // <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
+ // <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
+ // <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
+ // <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)
+ // <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
//
// These intrinsics are auto-upgraded into non-masked forms:
- // @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8)
- // @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16)
- // @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32)
- // @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8)
- // @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8)
- // @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16)
+ // <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16)
+ // <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8)
+ // <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16)
+ // <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32)
case Intrinsic::x86_sse2_pmadd_wd:
case Intrinsic::x86_avx2_pmadd_wd:
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
@@ -5453,12 +5453,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleVectorPmaddIntrinsic(I);
break;
- // @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
+ // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
case Intrinsic::x86_ssse3_pmadd_ub_sw:
handleVectorPmaddIntrinsic(I, 8);
break;
- // @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
+ // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
case Intrinsic::x86_mmx_pmadd_wd:
handleVectorPmaddIntrinsic(I, 16);
break;
>From 771360f0bf8a6959afc37730ac648dd276d9a3af Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 03:29:44 +0000
Subject: [PATCH 07/18] Add support for packed multiply add on bytes and words
---
.../Instrumentation/MemorySanitizer.cpp | 129 ++++++++++++++----
1 file changed, 104 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index ae0da50fa421f..2a33be2a6454c 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3624,9 +3624,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // Get an MMX-sized vector type.
- Type *getMMXVectorTy(unsigned EltSizeInBits) {
- const unsigned X86_MMXSizeInBits = 64;
+  // Get an MMX-sized (64-bit) vector type or, optionally, a vector of
+  // another total width.
+ Type *getMMXVectorTy(unsigned EltSizeInBits, unsigned X86_MMXSizeInBits = 64) {
assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
"Illegal MMX vector element size");
return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
@@ -3830,6 +3830,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
//
// e.g., <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
+ //
+ // For the three-operand form:
+ // <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+ // the horizontal addition is "quadwise" instead of pairwise (note the first
+ // two operands should actually be interpreted as vectors of bytes), and it
+ // is accumulated with the third operand.
void handleVectorPmaddIntrinsic(IntrinsicInst &I,
unsigned MMXEltSizeInBits = 0) {
IRBuilder<> IRB(&I);
@@ -3837,10 +3843,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
FixedVectorType *ReturnType = cast<FixedVectorType>(I.getType());
assert(isa<FixedVectorType>(ReturnType));
- assert(I.arg_size() == 2);
+ assert(I.arg_size() == 2 || I.arg_size() == 3);
FixedVectorType *ParamType =
cast<FixedVectorType>(I.getArgOperand(0)->getType());
assert(ParamType == I.getArgOperand(1)->getType());
+ if (I.arg_size() == 3)
+ assert(ParamType == I.getArgOperand(2)->getType());
Value *V1 = I.getOperand(0);
Value *V2 = I.getOperand(1);
@@ -3853,8 +3861,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *S2 = getShadow(&I, 1);
if (MMXEltSizeInBits) {
- ReturnType = cast<FixedVectorType>(getMMXVectorTy(MMXEltSizeInBits * 2));
- ParamType = cast<FixedVectorType>(getMMXVectorTy(MMXEltSizeInBits));
+ if (I.arg_size() != 3)
+ ReturnType = cast<FixedVectorType>(getMMXVectorTy(MMXEltSizeInBits * 2, ReturnType->getPrimitiveSizeInBits()));
+
+ ParamType = cast<FixedVectorType>(getMMXVectorTy(MMXEltSizeInBits, ParamType->getPrimitiveSizeInBits()));
V1 = IRB.CreateBitCast(V1, ParamType);
V2 = IRB.CreateBitCast(V2, ParamType);
@@ -3863,8 +3873,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
S2 = IRB.CreateBitCast(S2, getShadowTy(ParamType));
}
- assert(ParamType->getNumElements() ==
- 2 * ReturnType->getNumElements());
+ if (I.arg_size() != 3)
+ assert(ParamType->getNumElements() ==
+ 2 * ReturnType->getNumElements());
Value *S1S2 = IRB.CreateOr(S1, S2);
@@ -3879,22 +3890,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// <8 x i32> %ab, but we cheated and ended up with <8 x i16>.
S1S2 = IRB.CreateAnd(S1S2, IRB.CreateSExt(V1AndV2NotZero, S1S2->getType()));
- // Step 2: pairwise/horizontal add
- // Collapse <8 x i16> into <4 x i32>
+ // Step 2: horizontal add
// Handle it similarly to handlePairwiseShadowOrIntrinsic().
- unsigned TotalNumElems = ReturnType->getNumElements() * 2;
- SmallVector<int, 8> EvenMask;
- SmallVector<int, 8> OddMask;
- for (unsigned X = 0; X < TotalNumElems - 1; X += 2) {
- EvenMask.push_back(X);
- OddMask.push_back(X + 1);
+ //
+ // If arg_size() == 2:
+ // Collapse <8 x i16> into <4 x i16>
+ int ReductionFactor = 2;
+
+ if (I.arg_size() == 3)
+ // Quadwise addition
+ // Collapse <16 x i8> into <4 x i8>
+ ReductionFactor = 4;
+
+ unsigned TotalNumElems = ReturnType->getNumElements() * ReductionFactor;
+ Value *OrShadow = nullptr;
+ for (int i = 0; i < ReductionFactor; i++) {
+ SmallVector<int, 8> Mask;
+ for (unsigned X = 0; X < TotalNumElems; X += ReductionFactor)
+ Mask.push_back(X + i);
+
+ Value *MaskedShadow = IRB.CreateShuffleVector(S1S2, Mask);
+
+ if (OrShadow)
+ OrShadow = IRB.CreateOr(OrShadow, MaskedShadow);
+ else
+ OrShadow = MaskedShadow;
}
- Value *EvenShadow = IRB.CreateShuffleVector(S1S2, EvenMask);
- Value *OddShadow = IRB.CreateShuffleVector(S1S2, OddMask);
- Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow);
+ // Extend to <4 x i32>
OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
+ // Accumulate
+ if (I.arg_size() == 3)
+ OrShadow = IRB.CreateOr(OrShadow, getShadow(&I, 2));
+
setShadow(&I, OrShadow);
setOriginForNaryOp(I);
}
@@ -5430,12 +5459,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleVectorSadIntrinsic(I);
break;
- // <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
- // <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
- // <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
- // <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
- // <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)
- // <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
+ // Multiply and Add Packed Words
+ // <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
+ // <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
+ // <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)
+
+ // Multiply and Add Packed Signed and Unsigned Bytes
+ // <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
+ // <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
+ // <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
//
// These intrinsics are auto-upgraded into non-masked forms:
// <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8)
@@ -5463,6 +5495,53 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleVectorPmaddIntrinsic(I, 16);
break;
+ // Multiply and Add Packed Signed and Unsigned Bytes
+ // <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+ // <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+ // <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+ // <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+ // <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ // <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ //
+ // <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+ // <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+ // <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+ // <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+ //
+ // <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ // <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ case Intrinsic::x86_avx512_vpdpbusd_128:
+ case Intrinsic::x86_avx512_vpdpbusds_128:
+ case Intrinsic::x86_avx512_vpdpbusd_256:
+ case Intrinsic::x86_avx512_vpdpbusds_256:
+ case Intrinsic::x86_avx512_vpdpbusd_512:
+ case Intrinsic::x86_avx512_vpdpbusds_512:
+ case Intrinsic::x86_avx2_vpdpbssd_128:
+ case Intrinsic::x86_avx2_vpdpbssds_128:
+ case Intrinsic::x86_avx2_vpdpbssd_256:
+ case Intrinsic::x86_avx2_vpdpbssds_256:
+ case Intrinsic::x86_avx10_vpdpbssd_512:
+ case Intrinsic::x86_avx10_vpdpbssds_512:
+ handleVectorPmaddIntrinsic(I, 8);
+ break;
+
+ // Multiply and Add Signed Word Integers
+ // <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+ // <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+ // <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+ // <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+ // <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ // <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ case Intrinsic::x86_avx512_vpdpwssd_128:
+ case Intrinsic::x86_avx512_vpdpwssds_128:
+ case Intrinsic::x86_avx512_vpdpwssd_256:
+ case Intrinsic::x86_avx512_vpdpwssds_256:
+ case Intrinsic::x86_avx512_vpdpwssd_512:
+ case Intrinsic::x86_avx512_vpdpwssds_512:
+ handleVectorPmaddIntrinsic(I, 16);
+ break;
+
+
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse2_cmp_sd:
case Intrinsic::x86_sse_comieq_ss:
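The reduction loop added above builds ReductionFactor shuffle masks, each selecting one member of every group of ReductionFactor adjacent lanes; ORing the shuffled vectors collapses each group into one output lane. A standalone sketch of just the mask arithmetic (illustrative values, not the MSan code):

  #include <cstdio>
  #include <vector>

  // For reduction factor RF, output lane j ORs input lanes
  // j*RF .. j*RF + RF-1; mask i picks member i of every group.
  int main() {
    const unsigned ReductionFactor = 4; // e.g. vpdpbusd: 4 bytes -> 1 dword
    const unsigned TotalNumElems = 16;  // e.g. a <16 x i8> parameter
    for (unsigned i = 0; i < ReductionFactor; i++) {
      std::vector<int> Mask;
      for (unsigned X = 0; X < TotalNumElems; X += ReductionFactor)
        Mask.push_back(X + i);
      printf("mask %u:", i);
      for (int M : Mask)
        printf(" %d", M); // mask 0: 0 4 8 12, mask 1: 1 5 9 13, ...
      printf("\n");
    }
    return 0;
  }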
>From d828067808e06a72e55ee31420f2992dedd4793b Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 03:49:46 +0000
Subject: [PATCH 08/18] Make ReductionFactor parameterizable
---
.../Instrumentation/MemorySanitizer.cpp | 53 +++++++++----------
1 file changed, 25 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 2a33be2a6454c..01f047550b2bd 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3834,9 +3834,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// For the three-operand form:
// <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
// the horizontal addition is "quadwise" instead of pairwise (note the first
- // two operands should actually be interpreted as vectors of bytes), and it
- // is accumulated with the third operand.
- void handleVectorPmaddIntrinsic(IntrinsicInst &I,
+ // two operands are typically interpreted as bytes or words), and it is
+ // accumulated with the third operand.
+ void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
unsigned MMXEltSizeInBits = 0) {
IRBuilder<> IRB(&I);
@@ -3891,20 +3891,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
S1S2 = IRB.CreateAnd(S1S2, IRB.CreateSExt(V1AndV2NotZero, S1S2->getType()));
// Step 2: horizontal add
- // Handle it similarly to handlePairwiseShadowOrIntrinsic().
+ // e.g., collapse <8 x i16> into <4 x i16> (reduction factor == 2)
+ // <16 x i8> into <4 x i8> (reduction factor == 4)
//
- // If arg_size() == 2:
- // Collapse <8 x i16> into <4 x i16>
- int ReductionFactor = 2;
-
- if (I.arg_size() == 3)
- // Quadwise addition
- // Collapse <16 x i8> into <4 x i8>
- ReductionFactor = 4;
+ // Handle it similarly to handlePairwiseShadowOrIntrinsic().
unsigned TotalNumElems = ReturnType->getNumElements() * ReductionFactor;
Value *OrShadow = nullptr;
- for (int i = 0; i < ReductionFactor; i++) {
+ for (unsigned i = 0; i < ReductionFactor; i++) {
SmallVector<int, 8> Mask;
for (unsigned X = 0; X < TotalNumElems; X += ReductionFactor)
Mask.push_back(X + i);
@@ -5482,66 +5476,69 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx2_pmadd_ub_sw:
case Intrinsic::x86_avx512_pmaddw_d_512:
case Intrinsic::x86_avx512_pmaddubs_w_512:
- handleVectorPmaddIntrinsic(I);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 2);
break;
// <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
case Intrinsic::x86_ssse3_pmadd_ub_sw:
- handleVectorPmaddIntrinsic(I, 8);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 2, /*EltSize=*/ 8);
break;
// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
case Intrinsic::x86_mmx_pmadd_wd:
- handleVectorPmaddIntrinsic(I, 16);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 2, /*EltSize=*/ 16);
break;
// Multiply and Add Packed Signed and Unsigned Bytes
// <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
- // <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
// <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
- // <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ //
+ // <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+ // <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
//
// <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
// <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+ //
// <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
// <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
//
// <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
// <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
case Intrinsic::x86_avx512_vpdpbusd_128:
- case Intrinsic::x86_avx512_vpdpbusds_128:
case Intrinsic::x86_avx512_vpdpbusd_256:
- case Intrinsic::x86_avx512_vpdpbusds_256:
case Intrinsic::x86_avx512_vpdpbusd_512:
+ case Intrinsic::x86_avx512_vpdpbusds_128:
+ case Intrinsic::x86_avx512_vpdpbusds_256:
case Intrinsic::x86_avx512_vpdpbusds_512:
case Intrinsic::x86_avx2_vpdpbssd_128:
- case Intrinsic::x86_avx2_vpdpbssds_128:
case Intrinsic::x86_avx2_vpdpbssd_256:
+ case Intrinsic::x86_avx2_vpdpbssds_128:
case Intrinsic::x86_avx2_vpdpbssds_256:
case Intrinsic::x86_avx10_vpdpbssd_512:
case Intrinsic::x86_avx10_vpdpbssds_512:
- handleVectorPmaddIntrinsic(I, 8);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 4, /*EltSize=*/ 8);
break;
// Multiply and Add Signed Word Integers
// <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
- // <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
// <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
- // <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ //
+ // Multiply and Add Signed Word Integers With Saturation
+ // <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+ // <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
case Intrinsic::x86_avx512_vpdpwssd_128:
- case Intrinsic::x86_avx512_vpdpwssds_128:
case Intrinsic::x86_avx512_vpdpwssd_256:
- case Intrinsic::x86_avx512_vpdpwssds_256:
case Intrinsic::x86_avx512_vpdpwssd_512:
+ case Intrinsic::x86_avx512_vpdpwssds_128:
+ case Intrinsic::x86_avx512_vpdpwssds_256:
case Intrinsic::x86_avx512_vpdpwssds_512:
- handleVectorPmaddIntrinsic(I, 16);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 2, /*EltSize=*/ 16);
break;
-
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse2_cmp_sd:
case Intrinsic::x86_sse_comieq_ss:
>From ff477170c507c00be4653333e50a6fa08c529950 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 04:15:41 +0000
Subject: [PATCH 09/18] Generalize
---
.../Instrumentation/MemorySanitizer.cpp | 26 +++++++++----------
1 file changed, 12 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 01f047550b2bd..7610d2417c449 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3833,11 +3833,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
//
// For the three-operand form:
// <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
- // the horizontal addition is "quadwise" instead of pairwise (note the first
- // two operands are typically interpreted as bytes or words), and it is
- // accumulated with the third operand.
+ // the result of multiply-add'ing the first two operands is accumulated with
+ // the third operand.
void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
- unsigned MMXEltSizeInBits = 0) {
+ unsigned EltSizeInBits = 0) {
IRBuilder<> IRB(&I);
FixedVectorType *ReturnType = cast<FixedVectorType>(I.getType());
@@ -3860,11 +3859,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *S1 = getShadow(&I, 0);
Value *S2 = getShadow(&I, 1);
- if (MMXEltSizeInBits) {
+ if (EltSizeInBits) {
if (I.arg_size() != 3)
- ReturnType = cast<FixedVectorType>(getMMXVectorTy(MMXEltSizeInBits * 2, ReturnType->getPrimitiveSizeInBits()));
+ ReturnType = cast<FixedVectorType>(getMMXVectorTy(EltSizeInBits * ReductionFactor, ReturnType->getPrimitiveSizeInBits()));
- ParamType = cast<FixedVectorType>(getMMXVectorTy(MMXEltSizeInBits, ParamType->getPrimitiveSizeInBits()));
+ ParamType = cast<FixedVectorType>(getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
V1 = IRB.CreateBitCast(V1, ParamType);
V2 = IRB.CreateBitCast(V2, ParamType);
@@ -3873,10 +3872,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
S2 = IRB.CreateBitCast(S2, getShadowTy(ParamType));
}
- if (I.arg_size() != 3)
- assert(ParamType->getNumElements() ==
- 2 * ReturnType->getNumElements());
-
Value *S1S2 = IRB.CreateOr(S1, S2);
// Multiplying an uninitialized element by zero results in an initialized
@@ -3896,10 +3891,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
//
// Handle it similarly to handlePairwiseShadowOrIntrinsic().
- unsigned TotalNumElems = ReturnType->getNumElements() * ReductionFactor;
+ assert(ParamType->getNumElements() ==
+ ReturnType->getNumElements() * ReductionFactor);
+
+ unsigned TotalNumElems = ParamType->getNumElements();
Value *OrShadow = nullptr;
for (unsigned i = 0; i < ReductionFactor; i++) {
- SmallVector<int, 8> Mask;
+ SmallVector<int, 16> Mask;
for (unsigned X = 0; X < TotalNumElems; X += ReductionFactor)
Mask.push_back(X + i);
@@ -3912,7 +3910,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
// Extend to <4 x i32>
- OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
+ OrShadow = IRB.CreateZExt(OrShadow, getShadowTy(&I));
// Accumulate
if (I.arg_size() == 3)
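A small sketch of the lane arithmetic behind the generalized getMMXVectorTy() calls above, using the 128-bit vpdpbusd shapes as an assumed example (illustrative values only):

  #include <cstdio>

  // Operands are reinterpreted at EltSizeInBits; for the two-operand
  // forms the return type uses EltSizeInBits * ReductionFactor, keeping
  // the overall vector width fixed.
  int main() {
    unsigned WidthInBits = 128;   // e.g. the 128-bit vpdpbusd variants
    unsigned EltSizeInBits = 8;   // multiply on bytes
    unsigned ReductionFactor = 4; // four byte products per i32 lane
    printf("param:  <%u x i%u>\n", WidthInBits / EltSizeInBits,
           EltSizeInBits);                   // <16 x i8>
    printf("return: <%u x i%u>\n",
           WidthInBits / (EltSizeInBits * ReductionFactor),
           EltSizeInBits * ReductionFactor); // <4 x i32>
    return 0;
  }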
>From 66a1c7d3a8819b8ef24dbc18504fcfe0ae08dbb2 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 04:33:01 +0000
Subject: [PATCH 10/18] More comments, assertions. Fix cast.
---
.../Instrumentation/MemorySanitizer.cpp | 78 +++++++++++++------
1 file changed, 55 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 7610d2417c449..75d69c7f1031b 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3846,8 +3846,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
FixedVectorType *ParamType =
cast<FixedVectorType>(I.getArgOperand(0)->getType());
assert(ParamType == I.getArgOperand(1)->getType());
- if (I.arg_size() == 3)
+ if (I.arg_size() == 3) {
+ assert(ParamType == ReturnType);
assert(ParamType == I.getArgOperand(2)->getType());
+ }
Value *V1 = I.getOperand(0);
Value *V2 = I.getOperand(1);
@@ -3855,7 +3857,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
assert(ParamType->getPrimitiveSizeInBits() ==
ReturnType->getPrimitiveSizeInBits());
- // Step 1: multiplication of corresponding vector elements
+ // Step 1: instrument multiplication of corresponding vector elements
Value *S1 = getShadow(&I, 0);
Value *S2 = getShadow(&I, 1);
@@ -3885,7 +3887,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// <8 x i32> %ab, but we cheated and ended up with <8 x i16>.
S1S2 = IRB.CreateAnd(S1S2, IRB.CreateSExt(V1AndV2NotZero, S1S2->getType()));
- // Step 2: horizontal add
+ // Step 2: instrument horizontal add
// e.g., collapse <8 x i16> into <4 x i16> (reduction factor == 2)
// <16 x i8> into <4 x i8> (reduction factor == 4)
//
@@ -3910,7 +3912,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
// Extend to <4 x i32>
- OrShadow = IRB.CreateZExt(OrShadow, getShadowTy(&I));
+ OrShadow = IRB.CreateZExt(OrShadow, ReturnType);
// Accumulate
if (I.arg_size() == 3)
@@ -5452,20 +5454,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
// Multiply and Add Packed Words
- // <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
- // <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
+ // < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
+ // < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
// <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)
// Multiply and Add Packed Signed and Unsigned Bytes
- // <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
+ // < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
// <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
// <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
//
// These intrinsics are auto-upgraded into non-masked forms:
- // <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8)
- // <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8)
+ // < 4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8)
+ // < 8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8)
// <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16)
- // <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8)
+ // < 8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8)
// <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16)
// <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32)
case Intrinsic::x86_sse2_pmadd_wd:
@@ -5488,22 +5490,37 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
// Multiply and Add Packed Signed and Unsigned Bytes
- // <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
- // <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+ // < 4 x i32> @llvm.x86.avx512.vpdpbusd.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx512.vpdpbusd.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
//
- // <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
- // <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+ // < 4 x i32> @llvm.x86.avx512.vpdpbusds.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx512.vpdpbusds.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
//
- // <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
- // <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+ // < 4 x i32> @llvm.x86.avx2.vpdpbssd.128 (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 4 x i32> @llvm.x86.avx2.vpdpbssds.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
//
- // <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
- // <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+ // < 8 x i32> @llvm.x86.avx2.vpdpbssd.256 (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // < 8 x i32> @llvm.x86.avx2.vpdpbssds.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
//
- // <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ // <16 x i32> @llvm.x86.avx10.vpdpbssd.512 (<16 x i32>, <16 x i32>, <16 x i32>)
// <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ //
+ // These intrinsics are auto-upgraded into non-masked forms:
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ //
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
case Intrinsic::x86_avx512_vpdpbusd_128:
case Intrinsic::x86_avx512_vpdpbusd_256:
case Intrinsic::x86_avx512_vpdpbusd_512:
@@ -5520,14 +5537,29 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
// Multiply and Add Signed Word Integers
- // <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
- // <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+ // < 4 x i32> @llvm.x86.avx512.vpdpwssd.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx512.vpdpwssd.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
//
// Multiply and Add Signed Word Integers With Saturation
- // <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
- // <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+ // < 4 x i32> @llvm.x86.avx512.vpdpwssds.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx512.vpdpwssds.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ //
+ // These intrinsics are auto-upgraded into non-masked forms:
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ //
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
case Intrinsic::x86_avx512_vpdpwssd_128:
case Intrinsic::x86_avx512_vpdpwssd_256:
case Intrinsic::x86_avx512_vpdpwssd_512:
>From 59ee901272df966b9d69cb42441b8a857fd92b76 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 05:13:43 +0000
Subject: [PATCH 11/18] Add TODO for BF16
---
llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 75d69c7f1031b..48e80c9b379aa 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -5569,6 +5569,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 2, /*EltSize=*/ 16);
break;
+ // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single Precision
+ // <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+ // <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x bfloat>)
+ // <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>, <32 x bfloat>, <32 x bfloat>)
+ // handleVectorPmaddIntrinsic() currently only handles integer types.
+
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse2_cmp_sd:
case Intrinsic::x86_sse_comieq_ss:
>From ca4d4e3335c0c4c86c5d3097e263f817fbf0141f Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 05:19:28 +0000
Subject: [PATCH 12/18] Update comment
---
llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 48e80c9b379aa..5ea007bd88431 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -5494,17 +5494,18 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// < 8 x i32> @llvm.x86.avx512.vpdpbusd.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
//
+ // Multiply and Add Unsigned and Signed Bytes With Saturation
// < 4 x i32> @llvm.x86.avx512.vpdpbusds.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
// < 8 x i32> @llvm.x86.avx512.vpdpbusds.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
//
// < 4 x i32> @llvm.x86.avx2.vpdpbssd.128 (< 4 x i32>, < 4 x i32>, < 4 x i32>)
- // < 4 x i32> @llvm.x86.avx2.vpdpbssds.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
- //
// < 8 x i32> @llvm.x86.avx2.vpdpbssd.256 (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ //
+ // < 4 x i32> @llvm.x86.avx2.vpdpbssds.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
// < 8 x i32> @llvm.x86.avx2.vpdpbssds.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
//
- // <16 x i32> @llvm.x86.avx10.vpdpbssd.512 (<16 x i32>, <16 x i32>, <16 x i32>)
+ // <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
// <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
//
// These intrinsics are auto-upgraded into non-masked forms:
>From ef6d0084056a822cc5231e6240c898665c431fc8 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 05:43:28 +0000
Subject: [PATCH 13/18] Fix parameter order for 3-operand forms
---
.../Instrumentation/MemorySanitizer.cpp | 34 ++++++++++++-------
.../MemorySanitizer/X86/mmx-intrinsics.ll | 24 ++++++-------
.../MemorySanitizer/i386/mmx-intrinsics.ll | 24 ++++++-------
.../MemorySanitizer/vector_arith.ll | 8 ++---
4 files changed, 49 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 5ea007bd88431..c53d732ac9870 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3826,15 +3826,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // Instrument multiply-add intrinsic.
+ // Instrument multiply-add intrinsics.
//
- // e.g., <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
- // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
+ // e.g., Two operands:
+ // <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
+ // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
//
- // For the three-operand form:
- // <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
- // the result of multiply-add'ing the first two operands is accumulated with
- // the third operand.
+ // Three operands:
+ // <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
+ // (the result of multiply-add'ing %a and %b is accumulated with %s)
void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
unsigned EltSizeInBits = 0) {
IRBuilder<> IRB(&I);
@@ -3846,13 +3846,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
FixedVectorType *ParamType =
cast<FixedVectorType>(I.getArgOperand(0)->getType());
assert(ParamType == I.getArgOperand(1)->getType());
+
+ Value *V1;
+ Value *V2;
+
if (I.arg_size() == 3) {
assert(ParamType == ReturnType);
assert(ParamType == I.getArgOperand(2)->getType());
- }
- Value *V1 = I.getOperand(0);
- Value *V2 = I.getOperand(1);
+ V1 = I.getOperand(1);
+ V2 = I.getOperand(2);
+ } else {
+ V1 = I.getOperand(0);
+ V2 = I.getOperand(1);
+ }
assert(ParamType->getPrimitiveSizeInBits() ==
ReturnType->getPrimitiveSizeInBits());
@@ -3911,12 +3918,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
OrShadow = MaskedShadow;
}
- // Extend to <4 x i32>
- OrShadow = IRB.CreateZExt(OrShadow, ReturnType);
+ // Extend to <4 x i32>.
+ // For MMX, cast it back to <1 x i64>.
+ OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
// Accumulate
if (I.arg_size() == 3)
- OrShadow = IRB.CreateOr(OrShadow, getShadow(&I, 2));
+ OrShadow = IRB.CreateOr(OrShadow, getShadow(&I, 0));
setShadow(&I, OrShadow);
setOriginForNaryOp(I);
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index bdbd02338040b..a0341c67b1365 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -1702,12 +1702,12 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP26:%.*]] = or <2 x i16> [[TMP24]], [[TMP25]]
; CHECK-NEXT: [[TMP27:%.*]] = bitcast <2 x i16> [[TMP26]] to i32
; CHECK-NEXT: [[TMP28:%.*]] = zext i32 [[TMP27]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[TMP28]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP28]] to <1 x i64>
+; CHECK-NEXT: [[TMP33:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast <1 x i64> [[TMP33]] to <2 x i32>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP34]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3339,14 +3339,14 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP27:%.*]] = or <4 x i8> [[TMP25]], [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x i8> [[TMP27]] to i32
-; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i64 [[TMP29]] to <1 x i64>
-; CHECK-NEXT: [[TMP30:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i8> [[TMP27]] to i32
+; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP29]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast i64 [[TMP24]] to <1 x i64>
+; CHECK-NEXT: [[TMP36:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <1 x i64> [[TMP30]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+; CHECK-NEXT: [[TMP28:%.*]] = bitcast <1 x i64> [[TMP36]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP28]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
index 486b3bf1e411d..e0e75a91cbd2e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
@@ -1745,12 +1745,12 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP27:%.*]] = or <2 x i16> [[TMP25]], [[TMP26]]
; CHECK-NEXT: [[TMP28:%.*]] = bitcast <2 x i16> [[TMP27]] to i32
; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[TMP29]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP29]] to <1 x i64>
+; CHECK-NEXT: [[TMP34:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT: [[TMP35:%.*]] = bitcast <1 x i64> [[TMP34]] to <2 x i32>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP35]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3425,14 +3425,14 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP29:%.*]] = or <4 x i8> [[TMP27]], [[TMP28]]
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i8> [[TMP29]] to i32
-; CHECK-NEXT: [[TMP31:%.*]] = zext i32 [[TMP30]] to i64
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i64 [[TMP31]] to <1 x i64>
-; CHECK-NEXT: [[TMP32:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i8> [[TMP29]] to i32
+; CHECK-NEXT: [[TMP30:%.*]] = zext i32 [[TMP24]] to i64
+; CHECK-NEXT: [[TMP32:%.*]] = bitcast i64 [[TMP30]] to <1 x i64>
+; CHECK-NEXT: [[TMP31:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
; CHECK-NEXT: [[TMP25:%.*]] = bitcast <1 x i64> [[TMP32]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast <1 x i64> [[TMP31]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index 3a161f5bb3ac4..21996b13a9961 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -58,11 +58,11 @@ define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_me
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i8> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i8> [[TMP10]] to i32
-; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP12]] to <1 x i64>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x i8> [[TMP10]] to i32
+; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i64 [[TMP19]] to <1 x i64>
; CHECK-NEXT: [[C:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[A]], <1 x i64> [[B]]) #[[ATTR2]]
-; CHECK-NEXT: store <1 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <1 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <1 x i64> [[C]]
;
entry:
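A scalar model of the corrected operand order, with illustrative names: the multiply-add shadow comes from operands 1 and 2, and the accumulator's shadow (operand 0) is OR'ed in afterwards (a sketch of the semantics, not the MSan API):

  #include <cstdint>
  #include <cstdio>

  // For s += dot(a, b): the product shadow uses the zero-aware AND rule,
  // then the accumulator shadow is OR'ed in.
  uint32_t dotAccumShadow(uint32_t SAcc, uint32_t VA, uint32_t SA,
                          uint32_t VB, uint32_t SB) {
    uint32_t SAB = (VA != 0 && VB != 0) ? (SA | SB) : 0;
    return SAB | SAcc; // dirty if the accumulator or the product is dirty
  }

  int main() {
    // A poisoned accumulator stays poisoned even when a * b is clean.
    printf("%#x\n", dotAccumShadow(/*SAcc=*/0xff, /*VA=*/2, /*SA=*/0,
                                   /*VB=*/3, /*SB=*/0));
    return 0;
  }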
>From 49ed5ef06fcf550a9099ecc0a7e7724d9012dc98 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 05:47:45 +0000
Subject: [PATCH 14/18] clang-format
---
.../Instrumentation/MemorySanitizer.cpp | 166 +++++++++++-------
1 file changed, 98 insertions(+), 68 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index c53d732ac9870..affdcfa161bc9 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3626,7 +3626,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Get an MMX-sized (64-bit) vector type or, optionally, a vector of
// another total width.
- Type *getMMXVectorTy(unsigned EltSizeInBits, unsigned X86_MMXSizeInBits = 64) {
+ Type *getMMXVectorTy(unsigned EltSizeInBits,
+ unsigned X86_MMXSizeInBits = 64) {
assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
"Illegal MMX vector element size");
return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
@@ -3833,8 +3834,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
//
// Three operands:
- // <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
- // (the result of multiply-add'ing %a and %b is accumulated with %s)
+ // <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %s, <4 x i32> %a,
+  //                                         <4 x i32> %b)
+  // (the result of multiply-add'ing %a and %b is accumulated with %s)
void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
unsigned EltSizeInBits = 0) {
IRBuilder<> IRB(&I);
@@ -3869,16 +3871,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *S2 = getShadow(&I, 1);
if (EltSizeInBits) {
- if (I.arg_size() != 3)
- ReturnType = cast<FixedVectorType>(getMMXVectorTy(EltSizeInBits * ReductionFactor, ReturnType->getPrimitiveSizeInBits()));
+ if (I.arg_size() != 3)
+ ReturnType = cast<FixedVectorType>(
+ getMMXVectorTy(EltSizeInBits * ReductionFactor,
+ ReturnType->getPrimitiveSizeInBits()));
- ParamType = cast<FixedVectorType>(getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
+ ParamType = cast<FixedVectorType>(
+ getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
- V1 = IRB.CreateBitCast(V1, ParamType);
- V2 = IRB.CreateBitCast(V2, ParamType);
+ V1 = IRB.CreateBitCast(V1, ParamType);
+ V2 = IRB.CreateBitCast(V2, ParamType);
- S1 = IRB.CreateBitCast(S1, getShadowTy(ParamType));
- S2 = IRB.CreateBitCast(S2, getShadowTy(ParamType));
+ S1 = IRB.CreateBitCast(S1, getShadowTy(ParamType));
+ S2 = IRB.CreateBitCast(S2, getShadowTy(ParamType));
}
Value *S1S2 = IRB.CreateOr(S1, S2);
@@ -5472,64 +5477,78 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
//
// These intrinsics are auto-upgraded into non-masked forms:
- // < 4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8)
- // < 8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8)
- // <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16)
- // < 8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8)
- // <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16)
- // <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32)
+      // < 4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>,
+      //     <4 x i32>, i8)
+      // < 8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>,
+      //     <8 x i32>, i8)
+      // <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>,
+      //     <16 x i32>, i16)
+      // < 8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>,
+      //     <8 x i16>, i8)
+      // <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>,
+      //     <16 x i16>, i16)
+      // <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>,
+      //     <32 x i16>, i32)
case Intrinsic::x86_sse2_pmadd_wd:
case Intrinsic::x86_avx2_pmadd_wd:
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
case Intrinsic::x86_avx2_pmadd_ub_sw:
case Intrinsic::x86_avx512_pmaddw_d_512:
case Intrinsic::x86_avx512_pmaddubs_w_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 2);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
break;
// <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
case Intrinsic::x86_ssse3_pmadd_ub_sw:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 2, /*EltSize=*/ 8);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8);
break;
// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
case Intrinsic::x86_mmx_pmadd_wd:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 2, /*EltSize=*/ 16);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
break;
// Multiply and Add Packed Signed and Unsigned Bytes
- // < 4 x i32> @llvm.x86.avx512.vpdpbusd.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
- // < 8 x i32> @llvm.x86.avx512.vpdpbusd.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
- // <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+      // < 4 x i32> @llvm.x86.avx512.vpdpbusd.128(< 4 x i32>, < 4 x i32>,
+      //     < 4 x i32>)
+      // < 8 x i32> @llvm.x86.avx512.vpdpbusd.256(< 8 x i32>, < 8 x i32>,
+      //     < 8 x i32>)
+      // <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>,
+      //     <16 x i32>)
//
// Multiply and Add Unsigned and Signed Bytes With Saturation
- // < 4 x i32> @llvm.x86.avx512.vpdpbusds.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
- // < 8 x i32> @llvm.x86.avx512.vpdpbusds.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
- // <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+      // < 4 x i32> @llvm.x86.avx512.vpdpbusds.128(< 4 x i32>, < 4 x i32>,
+      //     < 4 x i32>)
+      // < 8 x i32> @llvm.x86.avx512.vpdpbusds.256(< 8 x i32>, < 8 x i32>,
+      //     < 8 x i32>)
+      // <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>,
+      //     <16 x i32>)
//
- // < 4 x i32> @llvm.x86.avx2.vpdpbssd.128 (< 4 x i32>, < 4 x i32>, < 4 x i32>)
- // < 8 x i32> @llvm.x86.avx2.vpdpbssd.256 (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+      // < 4 x i32> @llvm.x86.avx2.vpdpbssd.128 (< 4 x i32>, < 4 x i32>,
+      //     < 4 x i32>)
+      // < 8 x i32> @llvm.x86.avx2.vpdpbssd.256 (< 8 x i32>, < 8 x i32>,
+      //     < 8 x i32>)
//
- // < 4 x i32> @llvm.x86.avx2.vpdpbssds.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
- // < 8 x i32> @llvm.x86.avx2.vpdpbssds.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
+      // < 4 x i32> @llvm.x86.avx2.vpdpbssds.128(< 4 x i32>, < 4 x i32>,
+      //     < 4 x i32>)
+      // < 8 x i32> @llvm.x86.avx2.vpdpbssds.256(< 8 x i32>, < 8 x i32>,
+      //     < 8 x i32>)
//
- // <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
- // <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+      // <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>,
+      //     <16 x i32>)
+      // <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>,
+      //     <16 x i32>)
//
// These intrinsics are auto-upgraded into non-masked forms:
- // <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
- // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
- // <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
- // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
- // <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
- // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+      // <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32>, <4 x i32>,
+      //     <4 x i32>, i8)
+      // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128(<4 x i32>, <4 x i32>,
+      //     <4 x i32>, i8)
+      // <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256(<8 x i32>, <8 x i32>,
+      //     <8 x i32>, i8)
+      // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256(<8 x i32>, <8 x i32>,
+      //     <8 x i32>, i8)
+      // <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32>, <16 x i32>,
+      //     <16 x i32>, i16)
+      // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32>, <16 x i32>,
+      //     <16 x i32>, i16)
//
- // <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
- // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
- // <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
- // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
- // <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
- // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+      // <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32>, <4 x i32>,
+      //     <4 x i32>, i8)
+      // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128(<4 x i32>, <4 x i32>,
+      //     <4 x i32>, i8)
+      // <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32>, <8 x i32>,
+      //     <8 x i32>, i8)
+      // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256(<8 x i32>, <8 x i32>,
+      //     <8 x i32>, i8)
+      // <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32>, <16 x i32>,
+      //     <16 x i32>, i16)
+      // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32>, <16 x i32>,
+      //     <16 x i32>, i16)
case Intrinsic::x86_avx512_vpdpbusd_128:
case Intrinsic::x86_avx512_vpdpbusd_256:
case Intrinsic::x86_avx512_vpdpbusd_512:
@@ -5542,47 +5561,58 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx2_vpdpbssds_256:
case Intrinsic::x86_avx10_vpdpbssd_512:
case Intrinsic::x86_avx10_vpdpbssds_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 4, /*EltSize=*/ 8);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8);
break;
// Multiply and Add Signed Word Integers
- // < 4 x i32> @llvm.x86.avx512.vpdpwssd.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
- // < 8 x i32> @llvm.x86.avx512.vpdpwssd.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
- // <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ // < 4 x i32> @llvm.x86.avx512.vpdpwssd.128(< 4 x i32>, < 4 x i32>, < 4 x
+ // i32>) < 8 x i32> @llvm.x86.avx512.vpdpwssd.256(< 8 x i32>, < 8 x i32>,
+ // < 8 x i32>) <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x
+ // i32>, <16 x i32>)
//
// Multiply and Add Signed Word Integers With Saturation
- // < 4 x i32> @llvm.x86.avx512.vpdpwssds.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
- // < 8 x i32> @llvm.x86.avx512.vpdpwssds.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
- // <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ // < 4 x i32> @llvm.x86.avx512.vpdpwssds.128(< 4 x i32>, < 4 x i32>, < 4 x
+ // i32>) < 8 x i32> @llvm.x86.avx512.vpdpwssds.256(< 8 x i32>, < 8 x i32>,
+ // < 8 x i32>) <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x
+ // i32>, <16 x i32>)
//
// These intrinsics are auto-upgraded into non-masked forms:
- // <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
- // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
- // <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
- // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
- // <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
- // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x
+ // i32>, i8) <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128(<4 x i32>, <4 x
+ // i32>, <4 x i32>, i8) <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x
+ // i32>, <8 x i32>, <8 x i32>, i8) <8 x i32>
+ // @llvm.x86.avx512.maskz.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>,
+ // i8) <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x
+ // i32>, <16 x i32>, i16) <16 x i32>
+ // @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>,
+ // i16)
//
- // <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
- // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
- // <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
- // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
- // <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
- // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32>, <4 x i32>, <4
+ // x i32>, i8) <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128(<4 x i32>,
+ // <4 x i32>, <4 x i32>, i8) <8 x i32>
+ // @llvm.x86.avx512.mask.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>,
+ // i8) <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256(<8 x i32>, <8 x
+ // i32>, <8 x i32>, i8) <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16
+ // x i32>, <16 x i32>, <16 x i32>, i16) <16 x i32>
+ // @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x
+ // i32>, i16)
case Intrinsic::x86_avx512_vpdpwssd_128:
case Intrinsic::x86_avx512_vpdpwssd_256:
case Intrinsic::x86_avx512_vpdpwssd_512:
case Intrinsic::x86_avx512_vpdpwssds_128:
case Intrinsic::x86_avx512_vpdpwssds_256:
case Intrinsic::x86_avx512_vpdpwssds_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 2, /*EltSize=*/ 16);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
break;
- // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single Precision
- // <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x bfloat>, <8 x bfloat>)
- // <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x bfloat>)
- // <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>, <32 x bfloat>, <32 x bfloat>)
- // handleVectorPmaddIntrinsic() currently only handles integer types.
+ // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single
+ // Precision
+ // <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x
+ // bfloat>, <8 x bfloat>) <8 x float>
+ // @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x
+ // bfloat>) <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>,
+ // <32 x bfloat>, <32 x bfloat>)
+ // handleVectorPmaddIntrinsic() currently only handles integer types.
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse2_cmp_sd:
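For reference, ReductionFactor encodes how many products feed each output lane: the vpdpbusd/vpdpbssd family multiplies four byte pairs into each i32 accumulator (factor 4, EltSize 8), while the vpdpwssd family multiplies two word pairs (factor 2, EltSize 16). Below is a minimal scalar sketch of one vpdpbusd lane, for illustration only; the helper name is hypothetical, and it models the reference semantics of the instruction, not the instrumentation:

#include <cstdint>

// One i32 lane of vpdpbusd: four unsigned-byte x signed-byte products,
// summed into the accumulator (the "s" variants additionally saturate).
int32_t vpdpbusdLane(int32_t Acc, uint32_t A, uint32_t B) {
  for (int I = 0; I < 4; ++I) {
    uint8_t AByte = (A >> (8 * I)) & 0xFF;            // unsigned operand
    int8_t BByte = (int8_t)((B >> (8 * I)) & 0xFF);   // signed operand
    Acc += (int32_t)AByte * (int32_t)BByte;
  }
  return Acc;
}

The instrumentation never computes this sum itself; it only relies on the grouping to decide which shadow lanes to OR into each output lane.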
>From fffedf3aa626e00da0ed09195d8397ee492b2705 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 06:01:04 +0000
Subject: [PATCH 15/18] Move assertion earlier
---
.../Transforms/Instrumentation/MemorySanitizer.cpp | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index affdcfa161bc9..ceeeef2426237 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3834,9 +3834,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
//
// Three operands:
- // <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %s, <4 x i32> %a,
- // <4 x i32> %b) (the result of multiply-add'ing %a and %b is
- // accumulated with %s)
+ // <4 x i32> @llvm.x86.avx512.vpdpbusd.128
+ // (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
+ // (the result of multiply-adding %a and %b is accumulated with %s)
void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
unsigned EltSizeInBits = 0) {
IRBuilder<> IRB(&I);
@@ -3886,6 +3886,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
S2 = IRB.CreateBitCast(S2, getShadowTy(ParamType));
}
+ assert(ParamType->getNumElements() ==
+ ReturnType->getNumElements() * ReductionFactor);
+
Value *S1S2 = IRB.CreateOr(S1, S2);
// Multiplying an uninitialized element by zero results in an initialized
@@ -3904,10 +3907,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// <16 x i8> into <4 x i8> (reduction factor == 4)
//
// Handle it similarly to handlePairwiseShadowOrIntrinsic().
-
- assert(ParamType->getNumElements() ==
- ReturnType->getNumElements() * ReductionFactor);
-
unsigned TotalNumElems = ParamType->getNumElements();
Value *OrShadow = nullptr;
for (unsigned i = 0; i < ReductionFactor; i++) {
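The relocated assertion just checks the lane bookkeeping up front: each output lane consumes ReductionFactor input lanes, e.g. pmaddubs.w.512 collapses <64 x i8> into <32 x i16>. The "multiplying an uninitialized element by zero" comment in the hunk above refers to MSan's standard shadow rule for bitwise AND, which the multiply step borrows as an under-approximation. A scalar sketch of that rule, with hypothetical names rather than code from this patch:

#include <cstdint>

// MSan's shadow rule for c = a & b: a result bit is poisoned only if an
// uninitialized bit meets a bit that is 1 or itself uninitialized.
uint16_t andShadow(uint16_t V1, uint16_t S1, uint16_t V2, uint16_t S2) {
  return (S1 & S2) | (V1 & S2) | (S1 & V2);
}

Because this model ignores the carries a real multiply would propagate, the computed shadow is a subset of the precise one: it can miss poison, but it never flags fully initialized data.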
>From e63de12b3ab9a4d690513b6ffe9268190e5931e8 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 06:07:04 +0000
Subject: [PATCH 16/18] Revert "clang-format"
This reverts commit 49ed5ef06fcf550a9099ecc0a7e7724d9012dc98.
---
.../Instrumentation/MemorySanitizer.cpp | 161 +++++++-----------
1 file changed, 66 insertions(+), 95 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index ceeeef2426237..626b86cd30204 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3626,8 +3626,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Get an MMX-sized (64-bit) vector type, or optionally, other sized
// vectors.
- Type *getMMXVectorTy(unsigned EltSizeInBits,
- unsigned X86_MMXSizeInBits = 64) {
+ Type *getMMXVectorTy(unsigned EltSizeInBits, unsigned X86_MMXSizeInBits = 64) {
assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
"Illegal MMX vector element size");
return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
@@ -3871,19 +3870,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *S2 = getShadow(&I, 1);
if (EltSizeInBits) {
- if (I.arg_size() != 3)
- ReturnType = cast<FixedVectorType>(
- getMMXVectorTy(EltSizeInBits * ReductionFactor,
- ReturnType->getPrimitiveSizeInBits()));
+ if (I.arg_size() != 3)
+ ReturnType = cast<FixedVectorType>(getMMXVectorTy(EltSizeInBits * ReductionFactor, ReturnType->getPrimitiveSizeInBits()));
- ParamType = cast<FixedVectorType>(
- getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
+ ParamType = cast<FixedVectorType>(getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
- V1 = IRB.CreateBitCast(V1, ParamType);
- V2 = IRB.CreateBitCast(V2, ParamType);
+ V1 = IRB.CreateBitCast(V1, ParamType);
+ V2 = IRB.CreateBitCast(V2, ParamType);
- S1 = IRB.CreateBitCast(S1, getShadowTy(ParamType));
- S2 = IRB.CreateBitCast(S2, getShadowTy(ParamType));
+ S1 = IRB.CreateBitCast(S1, getShadowTy(ParamType));
+ S2 = IRB.CreateBitCast(S2, getShadowTy(ParamType));
}
assert(ParamType->getNumElements() ==
@@ -5476,78 +5472,64 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
//
// These intrinsics are auto-upgraded into non-masked forms:
- // < 4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4
- // x i32>, i8) < 8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>,
- // <16 x i16>, <8 x i32>, i8) <16 x i32>
- // @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>,
- // i16) < 8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x
- // i8>, <8 x i16>, i8) <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32
- // x i8>, <32 x i8>, <16 x i16>, i16) <32 x i16>
- // @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>,
- // i32)
+ // < 4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8)
+ // < 8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16)
+ // < 8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8)
+ // <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16)
+ // <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32)
case Intrinsic::x86_sse2_pmadd_wd:
case Intrinsic::x86_avx2_pmadd_wd:
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
case Intrinsic::x86_avx2_pmadd_ub_sw:
case Intrinsic::x86_avx512_pmaddw_d_512:
case Intrinsic::x86_avx512_pmaddubs_w_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 2);
break;
// <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
case Intrinsic::x86_ssse3_pmadd_ub_sw:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 2, /*EltSize=*/ 8);
break;
// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
case Intrinsic::x86_mmx_pmadd_wd:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 2, /*EltSize=*/ 16);
break;
// Multiply and Add Packed Signed and Unsigned Bytes
- // < 4 x i32> @llvm.x86.avx512.vpdpbusd.128(< 4 x i32>, < 4 x i32>, < 4 x
- // i32>) < 8 x i32> @llvm.x86.avx512.vpdpbusd.256(< 8 x i32>, < 8 x i32>,
- // < 8 x i32>) <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x
- // i32>, <16 x i32>)
+ // < 4 x i32> @llvm.x86.avx512.vpdpbusd.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx512.vpdpbusd.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
//
// Multiply and Add Unsigned and Signed Bytes With Saturation
- // < 4 x i32> @llvm.x86.avx512.vpdpbusds.128(< 4 x i32>, < 4 x i32>, < 4 x
- // i32>) < 8 x i32> @llvm.x86.avx512.vpdpbusds.256(< 8 x i32>, < 8 x i32>,
- // < 8 x i32>) <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x
- // i32>, <16 x i32>)
+ // < 4 x i32> @llvm.x86.avx512.vpdpbusds.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx512.vpdpbusds.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
//
- // < 4 x i32> @llvm.x86.avx2.vpdpbssd.128 (< 4 x i32>, < 4 x i32>, < 4 x
- // i32>) < 8 x i32> @llvm.x86.avx2.vpdpbssd.256 (< 8 x i32>, < 8 x i32>, <
- // 8 x i32>)
+ // < 4 x i32> @llvm.x86.avx2.vpdpbssd.128 (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx2.vpdpbssd.256 (< 8 x i32>, < 8 x i32>, < 8 x i32>)
//
- // < 4 x i32> @llvm.x86.avx2.vpdpbssds.128(< 4 x i32>, < 4 x i32>, < 4 x
- // i32>) < 8 x i32> @llvm.x86.avx2.vpdpbssds.256(< 8 x i32>, < 8 x i32>, <
- // 8 x i32>)
+ // < 4 x i32> @llvm.x86.avx2.vpdpbssds.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx2.vpdpbssds.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
//
- // <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x
- // i32>) <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>,
- // <16 x i32>)
+ // <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+ // <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
//
// These intrinsics are auto-upgraded into non-masked forms:
- // <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x
- // i32>, i8) <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128(<4 x i32>, <4 x
- // i32>, <4 x i32>, i8) <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256(<8 x
- // i32>, <8 x i32>, <8 x i32>, i8) <8 x i32>
- // @llvm.x86.avx512.maskz.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>,
- // i8) <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32>, <16 x
- // i32>, <16 x i32>, i16) <16 x i32>
- // @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>,
- // i16)
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
//
- // <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32>, <4 x i32>, <4
- // x i32>, i8) <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128(<4 x i32>,
- // <4 x i32>, <4 x i32>, i8) <8 x i32>
- // @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>,
- // i8) <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256(<8 x i32>, <8 x
- // i32>, <8 x i32>, i8) <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16
- // x i32>, <16 x i32>, <16 x i32>, i16) <16 x i32>
- // @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x
- // i32>, i16)
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
case Intrinsic::x86_avx512_vpdpbusd_128:
case Intrinsic::x86_avx512_vpdpbusd_256:
case Intrinsic::x86_avx512_vpdpbusd_512:
@@ -5560,58 +5542,47 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx2_vpdpbssds_256:
case Intrinsic::x86_avx10_vpdpbssd_512:
case Intrinsic::x86_avx10_vpdpbssds_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 4, /*EltSize=*/ 8);
break;
// Multiply and Add Signed Word Integers
- // < 4 x i32> @llvm.x86.avx512.vpdpwssd.128(< 4 x i32>, < 4 x i32>, < 4 x
- // i32>) < 8 x i32> @llvm.x86.avx512.vpdpwssd.256(< 8 x i32>, < 8 x i32>,
- // < 8 x i32>) <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x
- // i32>, <16 x i32>)
+ // < 4 x i32> @llvm.x86.avx512.vpdpwssd.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx512.vpdpwssd.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
//
// Multiply and Add Signed Word Integers With Saturation
- // < 4 x i32> @llvm.x86.avx512.vpdpwssds.128(< 4 x i32>, < 4 x i32>, < 4 x
- // i32>) < 8 x i32> @llvm.x86.avx512.vpdpwssds.256(< 8 x i32>, < 8 x i32>,
- // < 8 x i32>) <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x
- // i32>, <16 x i32>)
+ // < 4 x i32> @llvm.x86.avx512.vpdpwssds.128(< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // < 8 x i32> @llvm.x86.avx512.vpdpwssds.256(< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
//
// These intrinsics are auto-upgraded into non-masked forms:
- // <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x
- // i32>, i8) <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128(<4 x i32>, <4 x
- // i32>, <4 x i32>, i8) <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x
- // i32>, <8 x i32>, <8 x i32>, i8) <8 x i32>
- // @llvm.x86.avx512.maskz.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>,
- // i8) <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x
- // i32>, <16 x i32>, i16) <16 x i32>
- // @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>,
- // i16)
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
//
- // <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32>, <4 x i32>, <4
- // x i32>, i8) <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128(<4 x i32>,
- // <4 x i32>, <4 x i32>, i8) <8 x i32>
- // @llvm.x86.avx512.mask.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>,
- // i8) <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256(<8 x i32>, <8 x
- // i32>, <8 x i32>, i8) <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16
- // x i32>, <16 x i32>, <16 x i32>, i16) <16 x i32>
- // @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x
- // i32>, i16)
+ // <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
case Intrinsic::x86_avx512_vpdpwssd_128:
case Intrinsic::x86_avx512_vpdpwssd_256:
case Intrinsic::x86_avx512_vpdpwssd_512:
case Intrinsic::x86_avx512_vpdpwssds_128:
case Intrinsic::x86_avx512_vpdpwssds_256:
case Intrinsic::x86_avx512_vpdpwssds_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/ 2, /*EltSize=*/ 16);
break;
- // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single
- // Precision
- // <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x
- // bfloat>, <8 x bfloat>) <8 x float>
- // @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x
- // bfloat>) <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>,
- // <32 x bfloat>, <32 x bfloat>)
- // handleVectorPmaddIntrinsic() currently only handles integer types.
+ // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single Precision
+ // <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+ // <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x bfloat>)
+ // <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>, <32 x bfloat>, <32 x bfloat>)
+ // handleVectorPmaddIntrinsic() currently only handles integer types.
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse2_cmp_sd:
>From f6cdda23bab8715116a5256aa5a078fd151e54a7 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 06:09:48 +0000
Subject: [PATCH 17/18] Format
---
.../Instrumentation/MemorySanitizer.cpp | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 626b86cd30204..565592c7f4ab1 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3870,16 +3870,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *S2 = getShadow(&I, 1);
if (EltSizeInBits) {
- if (I.arg_size() != 3)
- ReturnType = cast<FixedVectorType>(getMMXVectorTy(EltSizeInBits * ReductionFactor, ReturnType->getPrimitiveSizeInBits()));
+ if (I.arg_size() != 3)
+ ReturnType = cast<FixedVectorType>(
+ getMMXVectorTy(EltSizeInBits * ReductionFactor,
+ ReturnType->getPrimitiveSizeInBits()));
- ParamType = cast<FixedVectorType>(getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
+ ParamType = cast<FixedVectorType>(
+ getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
- V1 = IRB.CreateBitCast(V1, ParamType);
- V2 = IRB.CreateBitCast(V2, ParamType);
+ V1 = IRB.CreateBitCast(V1, ParamType);
+ V2 = IRB.CreateBitCast(V2, ParamType);
- S1 = IRB.CreateBitCast(S1, getShadowTy(ParamType));
- S2 = IRB.CreateBitCast(S2, getShadowTy(ParamType));
+ S1 = IRB.CreateBitCast(S1, getShadowTy(ParamType));
+ S2 = IRB.CreateBitCast(S2, getShadowTy(ParamType));
}
assert(ParamType->getNumElements() ==
>From fd4e639478d0e769eeb192585dba03edb81a43b3 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 11 Aug 2025 06:52:13 +0000
Subject: [PATCH 18/18] Refactor into horizontalReduce
---
.../Instrumentation/MemorySanitizer.cpp | 108 ++++++++----------
1 file changed, 49 insertions(+), 59 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 565592c7f4ab1..9e65b616f1d67 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2690,6 +2690,41 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
SC.Done(&I);
}
+ // Perform a bitwise OR on the horizontal pairs (or other specified grouping)
+ // of elements. This is convenient for instrumenting horizontal add/sub.
+ Value *horizontalReduce(IntrinsicInst &I, unsigned ReductionFactor,
+ Value *VectorA, Value *VectorB) {
+ IRBuilder<> IRB(&I);
+ assert(isa<FixedVectorType>(VectorA->getType()));
+ unsigned TotalNumElems = cast<FixedVectorType>(VectorA->getType())->getNumElements();
+
+ if (VectorB) {
+ assert(VectorA->getType() == VectorB->getType());
+ TotalNumElems = TotalNumElems * 2;
+ }
+
+ Value *Or = nullptr;
+
+ for (unsigned i = 0; i < ReductionFactor; i++) {
+ SmallVector<int, 16> Mask;
+ for (unsigned X = 0; X < TotalNumElems; X += ReductionFactor)
+ Mask.push_back(X + i);
+
+ Value *Masked;
+ if (VectorB)
+ Masked = IRB.CreateShuffleVector(VectorA, VectorB, Mask);
+ else
+ Masked = IRB.CreateShuffleVector(VectorA, Mask);
+
+ if (Or)
+ Or = IRB.CreateOr(Or, Masked);
+ else
+ Or = Masked;
+ }
+
+ return Or;
+ }
+
/// Propagate shadow for 1- or 2-vector intrinsics that combine adjacent
/// fields.
///
@@ -2711,31 +2746,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
2 * ReturnType->getNumElements());
IRBuilder<> IRB(&I);
- unsigned Width = ParamType->getNumElements() * I.arg_size();
// Horizontal OR of shadow
- SmallVector<int, 8> EvenMask;
- SmallVector<int, 8> OddMask;
- for (unsigned X = 0; X < Width; X += 2) {
- EvenMask.push_back(X);
- OddMask.push_back(X + 1);
- }
-
Value *FirstArgShadow = getShadow(&I, 0);
- Value *EvenShadow;
- Value *OddShadow;
- if (I.arg_size() == 2) {
- Value *SecondArgShadow = getShadow(&I, 1);
- EvenShadow =
- IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, EvenMask);
- OddShadow =
- IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, OddMask);
- } else {
- EvenShadow = IRB.CreateShuffleVector(FirstArgShadow, EvenMask);
- OddShadow = IRB.CreateShuffleVector(FirstArgShadow, OddMask);
- }
+ Value *SecondArgShadow = nullptr;
+ if (I.arg_size() == 2)
+ SecondArgShadow = getShadow(&I, 1);
+
+ Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/ 2,
+ FirstArgShadow, SecondArgShadow);
- Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow);
OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
setShadow(&I, OrShadow);
@@ -2768,23 +2788,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
- unsigned TotalNumElems = ParamType->getNumElements() * I.arg_size();
FixedVectorType *ReinterpretShadowTy = nullptr;
assert(isAligned(Align(ReinterpretElemWidth),
ParamType->getPrimitiveSizeInBits()));
ReinterpretShadowTy = FixedVectorType::get(
IRB.getIntNTy(ReinterpretElemWidth),
ParamType->getPrimitiveSizeInBits() / ReinterpretElemWidth);
- TotalNumElems = ReinterpretShadowTy->getNumElements() * I.arg_size();
// Horizontal OR of shadow
- SmallVector<int, 8> EvenMask;
- SmallVector<int, 8> OddMask;
- for (unsigned X = 0; X < TotalNumElems - 1; X += 2) {
- EvenMask.push_back(X);
- OddMask.push_back(X + 1);
- }
-
Value *FirstArgShadow = getShadow(&I, 0);
FirstArgShadow = IRB.CreateBitCast(FirstArgShadow, ReinterpretShadowTy);
@@ -2796,22 +2807,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Align(2),
cast<FixedVectorType>(FirstArgShadow->getType())->getNumElements()));
- Value *EvenShadow;
- Value *OddShadow;
+ Value *SecondArgShadow = nullptr;
if (I.arg_size() == 2) {
- Value *SecondArgShadow = getShadow(&I, 1);
- SecondArgShadow = IRB.CreateBitCast(SecondArgShadow, ReinterpretShadowTy);
-
- EvenShadow =
- IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, EvenMask);
- OddShadow =
- IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, OddMask);
- } else {
- EvenShadow = IRB.CreateShuffleVector(FirstArgShadow, EvenMask);
- OddShadow = IRB.CreateShuffleVector(FirstArgShadow, OddMask);
+ SecondArgShadow = getShadow(&I, 1);
+ SecondArgShadow = IRB.CreateBitCast(SecondArgShadow,
+ ReinterpretShadowTy);
}
- Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow);
+ Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/ 2,
+ FirstArgShadow, SecondArgShadow);
+
OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
setShadow(&I, OrShadow);
@@ -3904,22 +3909,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Step 2: instrument horizontal add
// e.g., collapse <8 x i16> into <4 x i16> (reduction factor == 2)
// <16 x i8> into <4 x i8> (reduction factor == 4)
- //
- // Handle it similarly to handlePairwiseShadowOrIntrinsic().
- unsigned TotalNumElems = ParamType->getNumElements();
- Value *OrShadow = nullptr;
- for (unsigned i = 0; i < ReductionFactor; i++) {
- SmallVector<int, 16> Mask;
- for (unsigned X = 0; X < TotalNumElems; X += ReductionFactor)
- Mask.push_back(X + i);
-
- Value *MaskedShadow = IRB.CreateShuffleVector(S1S2, Mask);
-
- if (OrShadow)
- OrShadow = IRB.CreateOr(OrShadow, MaskedShadow);
- else
- OrShadow = MaskedShadow;
- }
+ Value *OrShadow = horizontalReduce(I, ReductionFactor, S1S2, nullptr);
// Extend to <4 x i32>.
// For MMX, cast it back to <1 x i64>.
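To see what the new helper computes, here is a standalone scalar model of the single-vector case; the function is hypothetical and only mirrors the shuffle-and-OR sequence above. With ReductionFactor R, output lane j is the OR of input lanes j*R through j*R + R - 1:

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of horizontalReduce: OR each group of ReductionFactor
// adjacent lanes into one output lane.
std::vector<uint32_t> horizontalReduceModel(const std::vector<uint32_t> &In,
                                            unsigned ReductionFactor) {
  std::vector<uint32_t> Out(In.size() / ReductionFactor, 0);
  for (unsigned J = 0; J < Out.size(); ++J)
    for (unsigned I = 0; I < ReductionFactor; ++I)
      Out[J] |= In[J * ReductionFactor + I];
  return Out;
}

int main() {
  // Shadow of an <8 x i16>-style input, collapsed with ReductionFactor == 2.
  std::vector<uint32_t> Shadow = {0xFFFF, 0, 0, 0, 0, 0xFFFF, 0, 0};
  for (uint32_t S : horizontalReduceModel(Shadow, 2))
    printf("%#x ", S); // prints: 0xffff 0 0xffff 0
  printf("\n");
  return 0;
}

When two vectors are passed (the pairwise horizontal add/sub handlers), the shuffles draw from the concatenation of VectorA and VectorB, which is why TotalNumElems doubles in that path.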