[llvm] [msan] Approximately handle AVX Galois Field Affine Transformation (PR #150794)
Thurston Dang via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 29 19:13:21 PDT 2025
https://github.com/thurstond updated https://github.com/llvm/llvm-project/pull/150794
>From 1c8915a918941cbc0fff0faba8cea99647f54d13 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Sat, 26 Jul 2025 19:51:37 +0000
Subject: [PATCH 1/4] [msan] Approximately handle AVX Galois Field Affine
Transformation
e.g.,
<16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
<32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
<64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
(the return value is Out; the operands are, in order, A, x, and b)
where A and x are packed matrices, b is a byte that is broadcast, and
Out = A * x + b in GF(2)
Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix computation also includes a parity calculation.
For the bitwise AND of bits V1 and V2, the exact shadow is:
Out_Shadow = (V1_Shadow & V2_Shadow) | (V1 & V2_Shadow) | (V1_Shadow & V2)
We approximate the shadow of gf2p8affine using:
Out_Shadow = _mm512_gf2p8affine_epi64_epi8(x_Shadow, A_shadow, 0)
| _mm512_gf2p8affine_epi64_epi8(x, A_shadow, 0)
| _mm512_gf2p8affine_epi64_epi8(x_Shadow, A, 0)
| _mm512_set1_epi8(b_Shadow)
This approximation has false negatives: if the uninitialized bits feeding an intermediate dot-product contain an even number of 1's, the computed parity is 0 and the uninitializedness is missed.
It has no false positives.
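To make the false-negative case concrete, here is a scalar sketch of a
single output bit, which is the parity of an 8-bit dot-product (all
values below are made up for illustration):

    #include <assert.h>
    #include <stdint.h>

    /* Parity (XOR reduction) of the set bits of v: the GF(2) dot-product. */
    static int parity(uint8_t v) { return __builtin_parity(v); }

    int main(void) {
      uint8_t a_row        = 0x00; /* concrete bits of one row of A       */
      uint8_t a_row_shadow = 0x06; /* bits 1 and 2 of the row are uninit. */
      uint8_t x_col        = 0x06; /* x selects exactly those two bits... */
      uint8_t x_col_shadow = 0x00; /* ...and is itself fully initialized  */

      /* Per-bit form of the approximation above (the b term omitted): */
      int out_shadow = parity(x_col_shadow & a_row_shadow) /* 0 */
                     | parity(x_col        & a_row_shadow) /* parity(0x06) == 0 */
                     | parity(x_col_shadow & a_row);       /* 0 */

      /* The real output bit, parity(x_col & a_row), depends on two
         uninitialized bits, yet the computed shadow is 0 because the two
         1's cancel in the parity: a false negative. With an odd number of
         overlapping uninitialized bits, the shadow would be 1. */
      assert(out_shadow == 0);
      return 0;
    }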
Updates the test from https://github.com/llvm/llvm-project/pull/149258
---
.../Instrumentation/MemorySanitizer.cpp | 80 ++++++++
.../X86/avx512-gfni-intrinsics.ll | 186 +++++++-----------
2 files changed, 155 insertions(+), 111 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index df31f07097f82..324c7f22253fd 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4769,6 +4769,78 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
+ // Approximately handle AVX Galois Field Affine Transformation
+ //
+ // e.g.,
+ // <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
+ // <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
+ // <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
+ // Out A x b
+ // where Out = A * x + b in GF(2) (but A and x are packed)
+ //
+ // Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix
+ // computation also includes a parity calculation.
+ //
+ // For the bitwise AND of bits V1 and V2, the exact shadow is:
+ // Out_Shadow = (V1_Shadow & V2_Shadow)
+ // | (V1 & V2_Shadow)
+ // | (V1_Shadow & V2_Shadow)
+ //
+ // We approximate the shadow of gf2p8affine using:
+ // Out_Shadow = _mm512_gf2p8affine_epi64_epi8(x_Shadow, A_shadow, 0)
+ // | _mm512_gf2p8affine_epi64_epi8(x, A_shadow, 0)
+ // | _mm512_gf2p8affine_epi64_epi8(x_Shadow, A, 0)
+ // | _mm512_set1_epi8(b_Shadow)
+ //
+ // This approximation has false negatives: if an intermediate dot-product
+ // contains an even number of 1's, the parity is 0.
+ // It has no false positives.
+ void handleAVXGF2P8Affine(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+
+ assert(I.arg_size() == 3);
+ Value *A = I.getOperand(0);
+ Value *x = I.getOperand(1);
+ Value *b = I.getOperand(2);
+
+ assert(isFixedIntVector(A));
+ assert(cast<VectorType>(A->getType())
+ ->getElementType()
+ ->getScalarSizeInBits() == 8);
+
+ assert(A->getType() == x->getType());
+
+ assert(b->getType()->isIntegerTy());
+ assert(b->getType()->getScalarSizeInBits() == 8);
+
+ assert(I.getType() == A->getType());
+
+ Value *AShadow = getShadow(A);
+ Value *xShadow = getShadow(x);
+ Value *bZeroShadow = getCleanShadow(b);
+
+ CallInst *xShadowAShadow = IRB.CreateIntrinsic(
+ I.getType(), I.getIntrinsicID(), {xShadow, AShadow, bZeroShadow});
+ CallInst *xAShadow = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
+ {x, AShadow, bZeroShadow});
+ CallInst *xShadowA = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
+ {xShadow, A, bZeroShadow});
+
+ unsigned NumElements = cast<FixedVectorType>(I.getType())->getNumElements();
+ Value *bShadow = getShadow(b);
+ Value *bBroadcastShadow = getCleanShadow(AShadow);
+ // There is no LLVM IR intrinsic for _mm512_set1_epi8.
+ // This loop generates a lot of LLVM IR, which we expect that CodeGen will
+ // lower appropriately (e.g., VPBROADCASTB).
+ // Besides, b is often a constant, in which case it is fully initialized.
+ for (unsigned i = 0; i < NumElements; i++)
+ bBroadcastShadow = IRB.CreateInsertElement(bBroadcastShadow, bShadow, i);
+
+ setShadow(&I, IRB.CreateOr(
+ {xShadowAShadow, xAShadow, xShadowA, bBroadcastShadow}));
+ setOriginForNaryOp(I);
+ }
+
// Handle Arm NEON vector load intrinsics (vld*).
//
// The WithLane instructions (ld[234]lane) are similar to:
@@ -5604,6 +5676,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
}
+ // AVX Galois Field New Instructions
+ case Intrinsic::x86_vgf2p8affineqb_128:
+ case Intrinsic::x86_vgf2p8affineqb_256:
+ case Intrinsic::x86_vgf2p8affineqb_512: {
+ handleAVXGF2P8Affine(I);
+ break;
+ }
+
case Intrinsic::fshl:
case Intrinsic::fshr:
handleFunnelShift(I);
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll
index e5e4371c525b2..43da02d19693c 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll
@@ -7,9 +7,6 @@
; - llvm.x86.vgf2p8affineinvqb.128
; - llvm.x86.vgf2p8affineinvqb.256
; - llvm.x86.vgf2p8affineinvqb.512
-; - llvm.x86.vgf2p8affineqb.128
-; - llvm.x86.vgf2p8affineqb.256
-; - llvm.x86.vgf2p8affineqb.512
;
; Heuristically handled:
; - llvm.x86.vgf2p8mulb.128
@@ -254,53 +251,42 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]], i8 3)
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP13]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
-; CHECK: 14:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 15:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC2:%.*]], <16 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[SRC1:%.*]], i8 0)
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i8> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i8> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 3)
+; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC2]], <16 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP20:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[SRC1]], i8 0)
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i8> [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i8> [[TMP17]], [[TMP20]]
+; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i8> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 4)
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP17]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP18]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
-; CHECK: 19:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 20:
+; CHECK-NEXT: [[TMP37:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC2]], <16 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP40:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[SRC1]], i8 0)
+; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i8> [[TMP37]], [[TMP39]]
+; CHECK-NEXT: [[TMP42:%.*]] = or <16 x i8> [[TMP41]], [[TMP40]]
+; CHECK-NEXT: [[TMP43:%.*]] = or <16 x i8> [[TMP42]], zeroinitializer
; CHECK-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 5)
-; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP19]], <16 x i8> zeroinitializer
; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i8> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i8> [[TMP23]], zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i8> [[TMP23]], [[TMP19]]
; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i8> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP25]], <16 x i8> [[TMP22]]
; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP16]], <16 x i8> zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> [[TMP4]]
+; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP43]], <16 x i8> [[TMP4]]
; CHECK-NEXT: [[TMP28:%.*]] = xor <16 x i8> [[TMP21]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i8> [[TMP28]], zeroinitializer
+; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i8> [[TMP28]], [[TMP43]]
; CHECK-NEXT: [[TMP30:%.*]] = or <16 x i8> [[TMP29]], [[TMP4]]
; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP30]], <16 x i8> [[TMP27]]
; CHECK-NEXT: [[TMP31:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP21]], <16 x i8> [[PASSTHRU]]
+; CHECK-NEXT: [[TMP38:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } { <16 x i8> splat (i8 -1), <16 x i8> splat (i8 -1), <16 x i8> splat (i8 -1) }, <16 x i8> [[TMP12]], 0
; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> [[TMP11]], 0
-; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } { <16 x i8> zeroinitializer, <16 x i8> splat (i8 -1), <16 x i8> splat (i8 -1) }, <16 x i8> [[_MSPROP_SELECT]], 1
+; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP38]], <16 x i8> [[_MSPROP_SELECT]], 1
; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP32]], <16 x i8> [[TMP26]], 1
; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP33]], <16 x i8> [[_MSPROP_SELECT1]], 2
; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP34]], <16 x i8> [[TMP31]], 2
@@ -329,53 +315,42 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %s
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x i8> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <32 x i8> [[TMP3]] to i256
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]], i8 3)
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <32 x i8> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP12]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <32 x i8> [[TMP3]] to i256
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP13]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
-; CHECK: 14:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 15:
+; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP8:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC2:%.*]], <32 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[SRC1:%.*]], i8 0)
+; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i8> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i8> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 3)
+; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP15:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC2]], <32 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP20:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[SRC1]], i8 0)
+; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i8> [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i8> [[TMP17]], [[TMP20]]
+; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i8> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 4)
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i8> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i256 [[TMP17]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast <32 x i8> [[TMP3]] to i256
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP18]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
-; CHECK: 19:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 20:
+; CHECK-NEXT: [[TMP37:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP39:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC2]], <32 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP40:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[SRC1]], i8 0)
+; CHECK-NEXT: [[TMP41:%.*]] = or <32 x i8> [[TMP37]], [[TMP39]]
+; CHECK-NEXT: [[TMP42:%.*]] = or <32 x i8> [[TMP41]], [[TMP40]]
+; CHECK-NEXT: [[TMP43:%.*]] = or <32 x i8> [[TMP42]], zeroinitializer
; CHECK-NEXT: [[TMP21:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 5)
-; CHECK-NEXT: [[TMP22:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP19]], <32 x i8> zeroinitializer
; CHECK-NEXT: [[TMP23:%.*]] = xor <32 x i8> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = or <32 x i8> [[TMP23]], zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = or <32 x i8> [[TMP23]], [[TMP19]]
; CHECK-NEXT: [[TMP25:%.*]] = or <32 x i8> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP25]], <32 x i8> [[TMP22]]
; CHECK-NEXT: [[TMP26:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP16]], <32 x i8> zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> [[TMP4]]
+; CHECK-NEXT: [[TMP27:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP43]], <32 x i8> [[TMP4]]
; CHECK-NEXT: [[TMP28:%.*]] = xor <32 x i8> [[TMP21]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP29:%.*]] = or <32 x i8> [[TMP28]], zeroinitializer
+; CHECK-NEXT: [[TMP29:%.*]] = or <32 x i8> [[TMP28]], [[TMP43]]
; CHECK-NEXT: [[TMP30:%.*]] = or <32 x i8> [[TMP29]], [[TMP4]]
; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP30]], <32 x i8> [[TMP27]]
; CHECK-NEXT: [[TMP31:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP21]], <32 x i8> [[PASSTHRU]]
+; CHECK-NEXT: [[TMP38:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } { <32 x i8> splat (i8 -1), <32 x i8> splat (i8 -1), <32 x i8> splat (i8 -1) }, <32 x i8> [[TMP12]], 0
; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } poison, <32 x i8> [[TMP11]], 0
-; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } { <32 x i8> zeroinitializer, <32 x i8> splat (i8 -1), <32 x i8> splat (i8 -1) }, <32 x i8> [[_MSPROP_SELECT]], 1
+; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP38]], <32 x i8> [[_MSPROP_SELECT]], 1
; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP32]], <32 x i8> [[TMP26]], 1
; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP33]], <32 x i8> [[_MSPROP_SELECT1]], 2
; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP34]], <32 x i8> [[TMP31]], 2
@@ -404,53 +379,42 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP1]] to <64 x i1>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <64 x i8> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]], i8 3)
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <64 x i8> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP13]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
-; CHECK: 14:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 15:
+; CHECK-NEXT: [[TMP7:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC2:%.*]], <64 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[SRC1:%.*]], i8 0)
+; CHECK-NEXT: [[TMP10:%.*]] = or <64 x i8> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i8> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 3)
+; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP15:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC2]], <64 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP20:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[SRC1]], i8 0)
+; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i8> [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <64 x i8> [[TMP17]], [[TMP20]]
+; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 4)
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP17]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast <64 x i8> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP18]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
-; CHECK: 19:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 20:
+; CHECK-NEXT: [[TMP37:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP39:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC2]], <64 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP40:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[SRC1]], i8 0)
+; CHECK-NEXT: [[TMP41:%.*]] = or <64 x i8> [[TMP37]], [[TMP39]]
+; CHECK-NEXT: [[TMP42:%.*]] = or <64 x i8> [[TMP41]], [[TMP40]]
+; CHECK-NEXT: [[TMP43:%.*]] = or <64 x i8> [[TMP42]], zeroinitializer
; CHECK-NEXT: [[TMP21:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 5)
-; CHECK-NEXT: [[TMP22:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP19]], <64 x i8> zeroinitializer
; CHECK-NEXT: [[TMP23:%.*]] = xor <64 x i8> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = or <64 x i8> [[TMP23]], zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = or <64 x i8> [[TMP23]], [[TMP19]]
; CHECK-NEXT: [[TMP25:%.*]] = or <64 x i8> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP25]], <64 x i8> [[TMP22]]
; CHECK-NEXT: [[TMP26:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP16]], <64 x i8> zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]]
+; CHECK-NEXT: [[TMP27:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP43]], <64 x i8> [[TMP4]]
; CHECK-NEXT: [[TMP28:%.*]] = xor <64 x i8> [[TMP21]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP29:%.*]] = or <64 x i8> [[TMP28]], zeroinitializer
+; CHECK-NEXT: [[TMP29:%.*]] = or <64 x i8> [[TMP28]], [[TMP43]]
; CHECK-NEXT: [[TMP30:%.*]] = or <64 x i8> [[TMP29]], [[TMP4]]
; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP30]], <64 x i8> [[TMP27]]
; CHECK-NEXT: [[TMP31:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP21]], <64 x i8> [[PASSTHRU]]
+; CHECK-NEXT: [[TMP38:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } { <64 x i8> splat (i8 -1), <64 x i8> splat (i8 -1), <64 x i8> splat (i8 -1) }, <64 x i8> [[TMP12]], 0
; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } poison, <64 x i8> [[TMP11]], 0
-; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } { <64 x i8> zeroinitializer, <64 x i8> splat (i8 -1), <64 x i8> splat (i8 -1) }, <64 x i8> [[_MSPROP_SELECT]], 1
+; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP38]], <64 x i8> [[_MSPROP_SELECT]], 1
; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP32]], <64 x i8> [[TMP26]], 1
; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP33]], <64 x i8> [[_MSPROP_SELECT1]], 2
; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP34]], <64 x i8> [[TMP31]], 2
>From f6f7659f7f0a0eb386288e46c447587d5298267f Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Sat, 26 Jul 2025 21:26:00 +0000
Subject: [PATCH 2/4] Touch up comment
---
.../Transforms/Instrumentation/MemorySanitizer.cpp | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 324c7f22253fd..077ca7e8cc021 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4776,21 +4776,21 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
// <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
// Out A x b
- // where Out = A * x + b in GF(2) (but A and x are packed)
+ // where Out = A * x + b in GF(2) (N.B. Out, A and x are packed)
//
// Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix
// computation also includes a parity calculation.
//
// For the bitwise AND of bits V1 and V2, the exact shadow is:
// Out_Shadow = (V1_Shadow & V2_Shadow)
- // | (V1 & V2_Shadow)
+ // | (V1 & V2_Shadow)
// | (V1_Shadow & V2_Shadow)
//
- // We approximate the shadow of gf2p8affine using:
- // Out_Shadow = _mm512_gf2p8affine_epi64_epi8(x_Shadow, A_shadow, 0)
- // | _mm512_gf2p8affine_epi64_epi8(x, A_shadow, 0)
- // | _mm512_gf2p8affine_epi64_epi8(x_Shadow, A, 0)
- // | _mm512_set1_epi8(b_Shadow)
+ // We approximate the shadow of gf2p8affineqb using:
+ // Out_Shadow = gf2p8affineqb(x_Shadow, A_shadow, 0)
+ // | gf2p8affineqb(x, A_shadow, 0)
+ // | gf2p8affineqb(x_Shadow, A, 0)
+ // | set1_epi8(b_Shadow)
//
// This approximation has false negatives: if an intermediate dot-product
// contains an even number of 1's, the parity is 0.
>From 73dc3173a39fcc22efb641a957fd559de769c2c2 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Tue, 29 Jul 2025 03:25:04 +0000
Subject: [PATCH 3/4] Improve comments and variable naming
---
.../Instrumentation/MemorySanitizer.cpp | 39 ++++++++++---------
1 file changed, 20 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 077ca7e8cc021..ed2015f5534dd 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4776,7 +4776,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
// <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
// Out A x b
- // where Out = A * x + b in GF(2) (N.B. Out, A and x are packed)
+ // where A and x are packed matrices, b is a vector,
+ // Out = A * x + b in GF(2)
//
// Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix
// computation also includes a parity calculation.
@@ -4784,7 +4785,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// For the bitwise AND of bits V1 and V2, the exact shadow is:
// Out_Shadow = (V1_Shadow & V2_Shadow)
// | (V1 & V2_Shadow)
- // | (V1_Shadow & V2_Shadow)
+ // | (V1_Shadow & V2 )
//
// We approximate the shadow of gf2p8affineqb using:
// Out_Shadow = gf2p8affineqb(x_Shadow, A_shadow, 0)
@@ -4800,44 +4801,44 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
assert(I.arg_size() == 3);
Value *A = I.getOperand(0);
- Value *x = I.getOperand(1);
- Value *b = I.getOperand(2);
+ Value *X = I.getOperand(1);
+ Value *B = I.getOperand(2);
assert(isFixedIntVector(A));
assert(cast<VectorType>(A->getType())
->getElementType()
->getScalarSizeInBits() == 8);
- assert(A->getType() == x->getType());
+ assert(A->getType() == X->getType());
- assert(b->getType()->isIntegerTy());
- assert(b->getType()->getScalarSizeInBits() == 8);
+ assert(B->getType()->isIntegerTy());
+ assert(B->getType()->getScalarSizeInBits() == 8);
assert(I.getType() == A->getType());
Value *AShadow = getShadow(A);
- Value *xShadow = getShadow(x);
- Value *bZeroShadow = getCleanShadow(b);
+ Value *XShadow = getShadow(X);
+ Value *BZeroShadow = getCleanShadow(B);
- CallInst *xShadowAShadow = IRB.CreateIntrinsic(
- I.getType(), I.getIntrinsicID(), {xShadow, AShadow, bZeroShadow});
- CallInst *xAShadow = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
- {x, AShadow, bZeroShadow});
- CallInst *xShadowA = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
- {xShadow, A, bZeroShadow});
+ CallInst *AShadowXShadow = IRB.CreateIntrinsic(
+ I.getType(), I.getIntrinsicID(), {XShadow, AShadow, BZeroShadow});
+ CallInst *AShadowX = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
+ {X, AShadow, BZeroShadow});
+ CallInst *XShadowA = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
+ {XShadow, A, BZeroShadow});
unsigned NumElements = cast<FixedVectorType>(I.getType())->getNumElements();
- Value *bShadow = getShadow(b);
- Value *bBroadcastShadow = getCleanShadow(AShadow);
+ Value *BShadow = getShadow(B);
+ Value *BBroadcastShadow = getCleanShadow(AShadow);
// There is no LLVM IR intrinsic for _mm512_set1_epi8.
// This loop generates a lot of LLVM IR, which we expect that CodeGen will
// lower appropriately (e.g., VPBROADCASTB).
// Besides, b is often a constant, in which case it is fully initialized.
for (unsigned i = 0; i < NumElements; i++)
- bBroadcastShadow = IRB.CreateInsertElement(bBroadcastShadow, bShadow, i);
+ BBroadcastShadow = IRB.CreateInsertElement(BBroadcastShadow, BShadow, i);
setShadow(&I, IRB.CreateOr(
- {xShadowAShadow, xAShadow, xShadowA, bBroadcastShadow}));
+ {AShadowXShadow, AShadowX, XShadowA, BBroadcastShadow}));
setOriginForNaryOp(I);
}
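As a scalar model of the corrected AND-shadow comment above (a sketch,
not code from this patch):

    #include <stdint.h>

    /* Exact shadow of (v1 & v2): a result bit is uninitialized iff both
       inputs are uninitialized, or one is uninitialized while the other
       is (possibly) 1. An uninitialized bit ANDed with a known 0 is a
       well-defined 0, which is why this is not simply s1 | s2. */
    static uint8_t and_shadow(uint8_t v1, uint8_t s1,
                              uint8_t v2, uint8_t s2) {
      return (s1 & s2) | (v1 & s2) | (s1 & v2);
    }

For example, and_shadow(0xFF, 0x00, 0x00, 0x0F) == 0x0F (AND with known-1
bits keeps the uninitializedness), while and_shadow(0x00, 0x00, 0x00, 0x0F)
== 0x00 (AND with known-0 bits discards it).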
>From 3dd17c7f3740d692c6e5004eb892f3eda5c4280d Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 30 Jul 2025 02:08:21 +0000
Subject: [PATCH 4/4] Teeth are straight, remove braces
---
llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index ed2015f5534dd..54d9a832d0f66 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -5680,10 +5680,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// AVX Galois Field New Instructions
case Intrinsic::x86_vgf2p8affineqb_128:
case Intrinsic::x86_vgf2p8affineqb_256:
- case Intrinsic::x86_vgf2p8affineqb_512: {
+ case Intrinsic::x86_vgf2p8affineqb_512:
handleAVXGF2P8Affine(I);
break;
- }
case Intrinsic::fshl:
case Intrinsic::fshr: