[llvm] [msan][test] Add some avx512bf16 tests (PR #166219)

Mon Nov 3 12:02:55 PST 2025

https://github.com/thurstond updated https://github.com/llvm/llvm-project/pull/166219

>From ab0ded8e1e4c97efd04a79b451b9b99939db1a48 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 3 Nov 2025 19:52:54 +0000
Subject: [PATCH 1/2] [msan][test] Add some avx512bf16 tests

Forked from llvm/test/CodeGen/X86
---
 .../X86/avx512bf16-intrinsics.ll              | 355 ++++++++
 .../MemorySanitizer/X86/avx512bf16-mov.ll     | 123 +++
 .../X86/avx512bf16-vl-intrinsics.ll           | 774 ++++++++++++++++++
 3 files changed, 1252 insertions(+)
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll

diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll
new file mode 100644
index 0000000000000..877fe5fe4b393
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll
@@ -0,0 +1,355 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll
+;
+; Strictly handled:
+; - llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B)
+; - llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A)
+; - llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B)
+;
+; Heuristically handled: (none)
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>) #3
+
+define <8 x i64> @test_mm512_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <8 x i64> @test_mm512_cvtne2ps2bf16_512(
+; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i32> [[TMP0]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x bfloat> [[TMP6]] to <8 x i64>
+; CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i64> [[TMP7]]
+; Input IR: unmasked cvtne2ps2bf16.512. Expected: warning branch if any shadow bit of %A or %B is set; return shadow is all zeros.
+entry:
+  %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4
+  %1 = bitcast <32 x bfloat> %0 to <8 x i64>
+  ret <8 x i64> %1
+}
+
+define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B, i32 %U) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512(
+; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[U]] to <32 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <32 x bfloat> [[TMP7]] to <32 x i16>
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <32 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = or <32 x i16> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP8]], <32 x i16> [[TMP14]], <32 x i16> [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = select <32 x i1> [[TMP9]], <32 x bfloat> [[TMP7]], <32 x bfloat> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <32 x bfloat> [[TMP15]] to <8 x i64>
+; CHECK-NEXT:    store <8 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i64> [[TMP17]]
+; Input IR: zero-masked form (select by %U against zeroinitializer). Expected: %A/%B shadows checked; select shadow propagation uses %U's shadow.
+entry:
+  %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4
+  %1 = bitcast i32 %U to <32 x i1>
+  %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer
+  %3 = bitcast <32 x bfloat> %2 to <8 x i64>
+  ret <8 x i64> %3
+}
+
+define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512(<8 x i64> %C, i32 %U, <16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512(
+; CHECK-SAME: <8 x i64> [[C:%.*]], i32 [[U:%.*]], <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i64> [[C]] to <32 x bfloat>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32 [[U]] to <32 x i1>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <32 x i1> [[TMP12]], <32 x i16> zeroinitializer, <32 x i16> [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <32 x bfloat> [[TMP8]] to <32 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <32 x bfloat> [[TMP10]] to <32 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <32 x i16> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <32 x i16> [[TMP16]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP18]], <32 x i16> [[TMP13]]
+; CHECK-NEXT:    [[TMP19:%.*]] = select <32 x i1> [[TMP12]], <32 x bfloat> [[TMP8]], <32 x bfloat> [[TMP10]]
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <32 x bfloat> [[TMP19]] to <8 x i64>
+; CHECK-NEXT:    store <8 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i64> [[TMP21]]
+; Input IR: merge-masked form (select by %U against passthrough %C). Expected: %A/%B shadows checked; select shadow propagation uses %U's and %C's shadows.
+entry:
+  %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4
+  %1 = bitcast <8 x i64> %C to <32 x bfloat>
+  %2 = bitcast i32 %U to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x bfloat> %0, <32 x bfloat> %1
+  %4 = bitcast <32 x bfloat> %3 to <8 x i64>
+  ret <8 x i64> %4
+}
+
+declare <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>) #3
+
+define <4 x i64> @test_mm512_cvtneps2bf16_512(<16 x float> %A) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <4 x i64> @test_mm512_cvtneps2bf16_512(
+; CHECK-SAME: <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32> [[TMP0]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x bfloat> [[TMP4]] to <4 x i64>
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP5]]
+; Input IR: unmasked cvtneps2bf16.512. Expected: warning branch if any shadow bit of %A is set; return shadow is all zeros.
+entry:
+  %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4
+  %1 = bitcast <16 x bfloat> %0 to <4 x i64>
+  ret <4 x i64> %1
+}
+
+define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512(<16 x float> %A, i16 %U) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512(
+; CHECK-SAME: <16 x float> [[A:%.*]], i16 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i32> [[TMP0]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[U]] to <16 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x bfloat> [[TMP5]] to <16 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <16 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i16> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i16> [[TMP12]], <16 x i16> [[TMP8]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP7]], <16 x bfloat> [[TMP5]], <16 x bfloat> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP13]] to <4 x i64>
+; CHECK-NEXT:    store <4 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP15]]
+; Input IR: zero-masked form (select by %U against zeroinitializer). Expected: %A's shadow checked; select shadow propagation uses %U's shadow.
+entry:
+  %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4
+  %1 = bitcast i16 %U to <16 x i1>
+  %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer
+  %3 = bitcast <16 x bfloat> %2 to <4 x i64>
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @test_mm512_mask_cvtneps2bf16_512(<4 x i64> %C, i16 %U, <16 x float> %A) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <4 x i64> @test_mm512_mask_cvtneps2bf16_512(
+; CHECK-SAME: <4 x i64> [[C:%.*]], i16 [[U:%.*]], <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[U]] to <16 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i16> zeroinitializer, <16 x i16> [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <16 x i16>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <16 x i16> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i16> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i16> [[TMP15]], [[TMP7]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> [[TMP16]], <16 x i16> [[TMP11]]
+; CHECK-NEXT:    [[TMP17:%.*]] = select <16 x i1> [[TMP10]], <16 x bfloat> [[TMP6]], <16 x bfloat> [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <16 x bfloat> [[TMP17]] to <4 x i64>
+; CHECK-NEXT:    store <4 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP19]]
+; Input IR: merge-masked form (select by %U against passthrough %C). Expected: %A's shadow checked; select shadow propagation uses %U's and %C's shadows.
+entry:
+  %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4
+  %1 = bitcast <4 x i64> %C to <16 x bfloat>
+  %2 = bitcast i16 %U to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1
+  %4 = bitcast <16 x bfloat> %3 to <4 x i64>
+  ret <4 x i64> %4
+}
+
+declare <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>, <32 x bfloat>, <32 x bfloat>) #3
+
+define <16 x float> @test_mm512_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <16 x float> @test_mm512_dpbf16ps_512(
+; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]])
+; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x float> [[TMP8]]
+; Input IR: unmasked dpbf16ps.512. Expected: warning branch if any shadow bit of %E, %A or %B is set; return shadow is all zeros.
+entry:
+  %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4
+  ret <16 x float> %0
+}
+
+define <16 x float> @test_mm512_maskz_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B, i16 zeroext %U) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <16 x float> @test_mm512_maskz_dpbf16ps_512(
+; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[U]] to <16 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP16]], <16 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> zeroinitializer
+; CHECK-NEXT:    store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x float> [[TMP17]]
+; Input IR: zero-masked form (select by %U against zeroinitializer). Expected: %E/%A/%B shadows checked; select shadow propagation uses %U's shadow.
+entry:
+  %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4
+  %1 = bitcast i16 %U to <16 x i1>
+  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
+  ret <16 x float> %2
+}
+define <16 x float> @test_mm512_mask_dpbf16ps_512(i16 zeroext %U, <16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <16 x float> @test_mm512_mask_dpbf16ps_512(
+; CHECK-SAME: i16 zeroext [[U:%.*]], <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[U]] to <16 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP0]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x float> [[E]] to <16 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP0]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[E]]
+; CHECK-NEXT:    store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x float> [[TMP18]]
+; Input IR: merge-masked form (select by %U against passthrough %E). Expected: %E/%A/%B shadows checked; select shadow propagation uses %U's and %E's shadows.
+entry:
+  %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4
+  %1 = bitcast i16 %U to <16 x i1>
+  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %E
+  ret <16 x float> %2
+}
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll
new file mode 100644
index 0000000000000..ac65645a9ec2c
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/X86/avx512bf16-mov.ll
+;
+; Strictly handled: (none)
+;
+; Heuristically handled: (none)
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @funbf16(ptr readonly %src, ptr writeonly %dst) sanitize_memory {
+; CHECK-LABEL: define dso_local void @funbf16(
+; CHECK-SAME: ptr readonly [[SRC:%.*]], ptr writeonly [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 87960930222080
+; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT:    store <8 x i16> [[_MSLD]], ptr [[TMP12]], align 1
+; CHECK-NEXT:    store <8 x bfloat> [[TMP4]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[TMP15:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 32
+; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 87960930222080
+; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+; CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP18]], align 32
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB19:.*]], label %[[BB20:.*]], !prof [[PROF1]]
+; CHECK:       [[BB19]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB20]]:
+; CHECK-NEXT:    [[TMP21:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT:    [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
+; CHECK-NEXT:    [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
+; CHECK-NEXT:    store <8 x i16> [[_MSLD1]], ptr [[TMP23]], align 32
+; CHECK-NEXT:    store <8 x bfloat> [[TMP15]], ptr [[DST]], align 32
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP7]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]]
+; CHECK:       [[BB24]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB25]]:
+; CHECK-NEXT:    [[TMP26:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 87960930222080
+; CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+; CHECK-NEXT:    [[_MSLD2:%.*]] = load <16 x i16>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP8]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]]
+; CHECK:       [[BB30]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB31]]:
+; CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 87960930222080
+; CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+; CHECK-NEXT:    store <16 x i16> [[_MSLD2]], ptr [[TMP34]], align 1
+; CHECK-NEXT:    store <16 x bfloat> [[TMP26]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP9]], label %[[BB35:.*]], label %[[BB36:.*]], !prof [[PROF1]]
+; CHECK:       [[BB35]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB36]]:
+; CHECK-NEXT:    [[TMP37:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 32
+; CHECK-NEXT:    [[TMP38:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT:    [[TMP39:%.*]] = xor i64 [[TMP38]], 87960930222080
+; CHECK-NEXT:    [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr
+; CHECK-NEXT:    [[_MSLD3:%.*]] = load <16 x i16>, ptr [[TMP40]], align 32
+; CHECK-NEXT:    [[_MSCMP10:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP10]], label %[[BB41:.*]], label %[[BB42:.*]], !prof [[PROF1]]
+; CHECK:       [[BB41]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB42]]:
+; CHECK-NEXT:    [[TMP43:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT:    [[TMP44:%.*]] = xor i64 [[TMP43]], 87960930222080
+; CHECK-NEXT:    [[TMP45:%.*]] = inttoptr i64 [[TMP44]] to ptr
+; CHECK-NEXT:    store <16 x i16> [[_MSLD3]], ptr [[TMP45]], align 32
+; CHECK-NEXT:    store <16 x bfloat> [[TMP37]], ptr [[DST]], align 32
+; CHECK-NEXT:    ret void
+; Input IR: <8 x bfloat>/<16 x bfloat> load+store pairs at align 1 and 32. Expected: pointer shadow checked before each access; data shadow copied via address XOR 87960930222080 at the same alignment.
+entry:
+  %0 = load <8 x bfloat>, ptr %src, align 1
+  store <8 x bfloat> %0, ptr %dst, align 1
+  %1 = load <8 x bfloat>, ptr %src, align 32
+  store <8 x bfloat> %1, ptr %dst, align 32
+  %2 = load <16 x bfloat>, ptr %src, align 1
+  store <16 x bfloat> %2, ptr %dst, align 1
+  %3 = load <16 x bfloat>, ptr %src, align 32
+  store <16 x bfloat> %3, ptr %dst, align 32
+  ret void
+}
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll
new file mode 100644
index 0000000000000..9c0f19ae36e98
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll
@@ -0,0 +1,774 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=msan -mattr=+avx512bf16 -mattr=+avx512vl < %s | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
+;
+; Strictly handled:
+; - llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B)
+; - llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B)
+; - llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A)
+; - llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B)
+; - llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B)
+; - llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %6, <4 x i1> %4)
+;
+; Heuristically handled: (none)
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) #1
+
+define <2 x i64> @test_mm_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @test_mm_cvtne2ps2bf16_128(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <2 x i64>
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+; Input IR: unmasked cvtne2ps2bf16.128 of %A and %B, returned as <2 x i64>. The assertions above expect both operand shadows to be checked before the call and an all-zero return shadow.
+entry:
+  %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
+  %1 = bitcast <8 x bfloat> %0 to <2 x i64>
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B, i8 zeroext %U) local_unnamed_addr #0 sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[U]] to <8 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16>
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <8 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i16> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i16> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i16> [[TMP14]], <8 x i16> [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = select <8 x i1> [[TMP9]], <8 x bfloat> [[TMP7]], <8 x bfloat> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <8 x bfloat> [[TMP15]] to <2 x i64>
+; CHECK-NEXT:    store <2 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP17]]
+; Input IR: zero-masking form; %U is bitcast to <8 x i1> and selects between the cvtne2ps2bf16.128 result and zeroinitializer. The assertions above expect the return shadow to come from the select (_MSPROP_SELECT) rather than being all-zero.
+entry:
+  %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
+  %1 = bitcast i8 %U to <8 x i1> ; one mask bit per bfloat lane
+  %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer
+  %3 = bitcast <8 x bfloat> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128(
+; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[U]] to <8 x i1>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i16> zeroinitializer, <8 x i16> [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <8 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <8 x i16> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i16> [[TMP16]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = or <8 x i16> [[TMP17]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i16> [[TMP18]], <8 x i16> [[TMP13]]
+; CHECK-NEXT:    [[TMP19:%.*]] = select <8 x i1> [[TMP12]], <8 x bfloat> [[TMP8]], <8 x bfloat> [[TMP10]]
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <2 x i64>
+; CHECK-NEXT:    store <2 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP21]]
+; Input IR: merge-masking form; %U selects per lane between the cvtne2ps2bf16.128 result and %C (bitcast to <8 x bfloat>). The assertions above expect the shadow loaded for %C to feed the select's shadow (_MSPROP_SELECT), which becomes the return shadow.
+entry:
+  %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
+  %1 = bitcast <2 x i64> %C to <8 x bfloat>
+  %2 = bitcast i8 %U to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1
+  %4 = bitcast <8 x bfloat> %3 to <2 x i64>
+  ret <2 x i64> %4
+}
+
+declare <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) #3
+
+define <4 x i64> @test_mm256_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory {
+; CHECK-LABEL: define <4 x i64> @test_mm256_cvtne2ps2bf16_256(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <4 x i64>
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+; Input IR: unmasked cvtne2ps2bf16.256 of %A and %B, returned as <4 x i64>. The assertions above expect both operand shadows to be checked before the call and an all-zero return shadow.
+entry:
+  %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
+  %1 = bitcast <16 x bfloat> %0 to <4 x i64>
+  ret <4 x i64> %1
+}
+
+define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B, i16 zeroext %U) local_unnamed_addr #1 sanitize_memory {
+; CHECK-LABEL: define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i16 [[U]] to <16 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x bfloat> [[TMP7]] to <16 x i16>
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <16 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = or <16 x i16> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i16> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i16> [[TMP14]], <16 x i16> [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = select <16 x i1> [[TMP9]], <16 x bfloat> [[TMP7]], <16 x bfloat> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <16 x bfloat> [[TMP15]] to <4 x i64>
+; CHECK-NEXT:    store <4 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP17]]
+; Input IR: zero-masking form; %U is bitcast to <16 x i1> and selects between the cvtne2ps2bf16.256 result and zeroinitializer. The assertions above expect the return shadow to come from the select (_MSPROP_SELECT) rather than being all-zero.
+entry:
+  %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
+  %1 = bitcast i16 %U to <16 x i1> ; one mask bit per bfloat lane
+  %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer
+  %3 = bitcast <16 x bfloat> %2 to <4 x i64>
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256(<4 x i64> %C, i16 zeroext %U, <8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory {
+; CHECK-LABEL: define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256(
+; CHECK-SAME: <4 x i64> [[C:%.*]], i16 zeroext [[U:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[U]] to <16 x i1>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i16> zeroinitializer, <16 x i16> [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP10]] to <16 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <16 x i16> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i16> [[TMP16]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i16> [[TMP17]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i16> [[TMP18]], <16 x i16> [[TMP13]]
+; CHECK-NEXT:    [[TMP19:%.*]] = select <16 x i1> [[TMP12]], <16 x bfloat> [[TMP8]], <16 x bfloat> [[TMP10]]
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <16 x bfloat> [[TMP19]] to <4 x i64>
+; CHECK-NEXT:    store <4 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP21]]
+; Input IR: merge-masking form; %U selects per lane between the cvtne2ps2bf16.256 result and %C (bitcast to <16 x bfloat>). The assertions above expect the shadow loaded for %C to feed the select's shadow (_MSPROP_SELECT), which becomes the return shadow.
+entry:
+  %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
+  %1 = bitcast <4 x i64> %C to <16 x bfloat>
+  %2 = bitcast i16 %U to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1
+  %4 = bitcast <16 x bfloat> %3 to <4 x i64>
+  ret <4 x i64> %4
+}
+
+declare <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) #3
+
+define <2 x i64> @test_mm256_cvtneps2bf16_256(<8 x float> %A) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @test_mm256_cvtneps2bf16_256(
+; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32> [[TMP0]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <2 x i64>
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
+; Input IR: unmasked cvtneps2bf16.256 of %A, returned as <2 x i64>. The assertions above expect the shadow of %A to be checked before the call and an all-zero return shadow.
+entry:
+  %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
+  %1 = bitcast <8 x bfloat> %0 to <2 x i64>
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256(<8 x float> %A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256(
+; CHECK-SAME: <8 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[U]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <8 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i16> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i16> [[TMP12]], <8 x i16> [[TMP8]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select <8 x i1> [[TMP7]], <8 x bfloat> [[TMP5]], <8 x bfloat> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64>
+; CHECK-NEXT:    store <2 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP15]]
+; Input IR: zero-masking form; %U is bitcast to <8 x i1> and selects between the cvtneps2bf16.256 result and zeroinitializer. The assertions above expect the return shadow to come from the select (_MSPROP_SELECT) rather than being all-zero.
+entry:
+  %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
+  %1 = bitcast i8 %U to <8 x i1> ; one mask bit per bfloat lane
+  %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer
+  %3 = bitcast <8 x bfloat> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @test_mm256_mask_cvtneps2bf16_256(<2 x i64> %C, i8 zeroext %U, <8 x float> %A) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @test_mm256_mask_cvtneps2bf16_256(
+; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[U]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i16> zeroinitializer, <8 x i16> [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <8 x i16> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i16> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i16> [[TMP15]], [[TMP7]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> [[TMP16]], <8 x i16> [[TMP11]]
+; CHECK-NEXT:    [[TMP17:%.*]] = select <8 x i1> [[TMP10]], <8 x bfloat> [[TMP6]], <8 x bfloat> [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <2 x i64>
+; CHECK-NEXT:    store <2 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP19]]
+; Input IR: merge-masking form; %U selects per lane between the cvtneps2bf16.256 result and %C (bitcast to <8 x bfloat>). The assertions above expect the shadow loaded for %C to feed the select's shadow (_MSPROP_SELECT), which becomes the return shadow.
+entry:
+  %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
+  %1 = bitcast <2 x i64> %C to <8 x bfloat>
+  %2 = bitcast i8 %U to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1
+  %4 = bitcast <8 x bfloat> %3 to <2 x i64>
+  ret <2 x i64> %4
+}
+
+declare <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x bfloat>, <4 x i1>) #3
+
+define <2 x i64> @test_mm128_cvtneps2bf16_128(<4 x float> %A) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128(
+; CHECK-SAME: <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> undef, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+; Input IR: mask.cvtneps2bf16.128 with an undef passthrough and an all-true <4 x i1> mask. The assertions above expect an unconditional call to @__msan_warning_noreturn before the intrinsic. NOTE(review): presumably caused by the undef passthrough operand; confirm this is intended.
+entry:
+  %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
+  %1 = bitcast <8 x bfloat> %0 to <2 x i64>
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(<4 x float> %A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(
+; CHECK-SAME: <4 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[TMP0]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[U]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> zeroinitializer, <4 x i1> [[TMP4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP9]] to <2 x i64>
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP10]]
+; Input IR: zero-masking form; the low 4 bits of %U form the <4 x i1> mask and the passthrough is zeroinitializer. The assertions above expect the shadows of %A and of the 4-lane mask to be checked before the call, with an all-zero return shadow.
+entry:
+  %0 = bitcast i8 %U to <8 x i1>
+  %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep mask lanes 0-3 only
+  %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> zeroinitializer, <4 x i1> %1) #4
+  %3 = bitcast <8 x bfloat> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(
+; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP6]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP10]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]]
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    [[TMP13:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> [[TMP7]], <4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64>
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP14]]
+; Input IR: merge-masking form; the low 4 bits of %U form the mask and %C (bitcast to <8 x bfloat>) is the passthrough. The assertions above expect the shadows of %A, %C and the 4-lane mask to be checked before the call, with an all-zero return shadow.
+entry:
+  %0 = bitcast i8 %U to <8 x i1>
+  %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep mask lanes 0-3 only
+  %2 = bitcast <2 x i64> %C to <8 x bfloat>
+  %3 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %2, <4 x i1> %1) #4
+  %4 = bitcast <8 x bfloat> %3 to <2 x i64>
+  ret <2 x i64> %4
+}
+
+define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128_select(
+; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat>
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> undef, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP4]], <8 x i16> zeroinitializer, <8 x i16> [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16>
+; CHECK-NEXT:    [[TMP11:%.*]] = xor <8 x i16> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i16> [[TMP12]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> [[TMP13]], <8 x i16> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select <8 x i1> [[TMP4]], <8 x bfloat> [[TMP7]], <8 x bfloat> [[TMP6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x bfloat> [[TMP14]] to <2 x i64>
+; CHECK-NEXT:    store <2 x i64> [[TMP15]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP16]]
+; Input IR: all-true mask.cvtneps2bf16.128 with an undef passthrough, then an IR select against %C under the 8-lane mask from %U. The assertions above expect an unconditional @__msan_warning_noreturn call and a return shadow taken from _MSPROP_SELECT. NOTE(review): the warning is presumably caused by the undef passthrough; confirm this is intended.
+entry:
+  %0 = bitcast i8 %U to <8 x i1>
+  %1 = bitcast <2 x i64> %C to <8 x bfloat>
+  %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
+  %3 = select <8 x i1> %0, <8 x bfloat> %2, <8 x bfloat> %1 ; the merge with %C is done here, not by the intrinsic
+  %4 = bitcast <8 x bfloat> %3 to <2 x i64>
+  ret <2 x i64> %4
+}
+
+declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x bfloat>) #3
+
+define <8 x float> @test_mm256_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory {
+; CHECK-LABEL: define <8 x float> @test_mm256_dpbf16ps_256(
+; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP8]]
+; Input IR: unmasked dpbf16ps.256 of %E, %A and %B, returned directly. The assertions above expect all three operand shadows to be checked before the call and an all-zero return shadow.
+entry:
+  %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4
+  ret <8 x float> %0
+}
+
+define <8 x float> @test_mm256_maskz_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { ; Zero-masking form: dpbf16ps.256 result lanes are kept where the matching bit of %U is set and zeroed elsewhere.
+; CHECK-LABEL: define <8 x float> @test_mm256_maskz_dpbf16ps_256(
+; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <8 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP16]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP17]]
+;
+entry:
+  %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4
+  %1 = bitcast i8 %U to <8 x i1> ; one mask bit per result lane
+  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer ; lanes with a clear mask bit become zero
+  ret <8 x float> %2
+}
+define <8 x float> @test_mm256_mask_dpbf16ps_256(i8 zeroext %U, <8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { ; Merge-masking form: dpbf16ps.256 result lanes are kept where the matching bit of %U is set, other lanes keep %E.
+; CHECK-LABEL: define <8 x float> @test_mm256_mask_dpbf16ps_256(
+; CHECK-SAME: i8 zeroext [[U:%.*]], <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> [[TMP0]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x float> [[E]] to <8 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <8 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP0]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP17]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> [[E]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP18]]
+;
+entry:
+  %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4
+  %1 = bitcast i8 %U to <8 x i1> ; one mask bit per result lane
+  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %E ; lanes with a clear mask bit pass %E through
+  ret <8 x float> %2
+}
+
+declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x bfloat>, <8 x bfloat>) #3
+
+define <4 x float> @test_mm128_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { ; Unmasked dpbf16ps.128: the CHECK lines expect the shadows of %E, %A and %B to be tested before the call and a zero shadow to be returned.
+; CHECK-LABEL: define <4 x float> @test_mm128_dpbf16ps_128(
+; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP8]]
+;
+entry:
+  %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4
+  ret <4 x float> %0 ; intrinsic result is returned unmodified
+}
+
+define <4 x float> @test_mm128_maskz_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B, i4 zeroext %U) local_unnamed_addr #2 sanitize_memory { ; Zero-masking form: dpbf16ps.128 result lanes are kept where the matching bit of %U is set and zeroed elsewhere.
+; CHECK-LABEL: define <4 x float> @test_mm128_maskz_dpbf16ps_128(
+; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]], i4 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP16]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP17]]
+;
+entry:
+  %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4
+  %1 = bitcast i4 %U to <4 x i1> ; one mask bit per result lane
+  %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> zeroinitializer ; lanes with a clear mask bit become zero
+  ret <4 x float> %2
+}
+define <4 x float> @test_mm128_mask_dpbf16ps_128(i4 zeroext %U, <4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { ; Merge-masking form: dpbf16ps.128 result lanes are kept where the matching bit of %U is set, other lanes keep %E.
+; CHECK-LABEL: define <4 x float> @test_mm128_mask_dpbf16ps_128(
+; CHECK-SAME: i4 zeroext [[U:%.*]], <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i4, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> zeroinitializer, <4 x i32> [[TMP0]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x float> [[E]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP0]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP17]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> [[E]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP18]]
+;
+entry:
+  %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4
+  %1 = bitcast i4 %U to <4 x i1> ; one mask bit per result lane
+  %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> %E ; lanes with a clear mask bit pass %E through
+  ret <4 x float> %2
+}
+
+define <16 x i16> @test_no_vbroadcast1() sanitize_memory { ; Converts a poison <4 x float> with mask.cvtneps2bf16.128, then splats element 0 of the i16 view to 16 lanes; the CHECK lines expect an unconditional __msan_warning_noreturn call.
+; CHECK-LABEL: define <16 x i16> @test_no_vbroadcast1(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <8 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i16> [[TMP2]]
+;
+entry:
+  %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = bitcast <8 x bfloat> %0 to <8 x i16>
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> zeroinitializer
+  ret <16 x i16> %2
+}
+
+define <16 x bfloat> @test_no_vbroadcast2() nounwind sanitize_memory { ; Converts a poison <4 x float> with mask.cvtneps2bf16.128, then splats element 0 of the bfloat result to 16 lanes; the CHECK lines expect an unconditional __msan_warning_noreturn call.
+; CHECK-LABEL: define <16 x bfloat> @test_no_vbroadcast2(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x bfloat> [[TMP0]], <8 x bfloat> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x bfloat> [[TMP1]]
+;
+entry:
+  %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer
+  ret <16 x bfloat> %1
+}
+
+define <16 x i32> @pr83358() sanitize_memory { ; Converts a constant <8 x float> with cvtneps2bf16.256 and repeats the resulting <4 x i32> four times; the CHECK lines expect no shadow check and a zero return shadow.
+; CHECK-LABEL: define <16 x i32> @pr83358(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i32> [[TMP3]]
+;
+  %1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>)
+  %2 = bitcast <8 x bfloat> %1 to <4 x i32>
+  %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  ret <16 x i32> %3
+}
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.

>From 87ad85351836eb066259210f0e62c16e779865d7 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 3 Nov 2025 19:58:41 +0000
Subject: [PATCH 2/2] Replace undef with poison

---
 .../X86/avx512bf16-vl-intrinsics.ll           | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll
index 9c0f19ae36e98..904614e961d6c 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll
@@ -364,13 +364,13 @@ define <2 x i64> @test_mm128_cvtneps2bf16_128(<4 x float> %A) local_unnamed_addr
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> undef, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <2 x i64>
 ; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
 ;
 entry:
-  %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
+  %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
   %1 = bitcast <8 x bfloat> %0 to <2 x i64>
   ret <2 x i64> %1
 }
@@ -385,7 +385,7 @@ define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(<4 x float> %A, i8 zeroext %
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[TMP0]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[U]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
@@ -403,7 +403,7 @@ define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(<4 x float> %A, i8 zeroext %
 ;
 entry:
   %0 = bitcast i8 %U to <8 x i1>
-  %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> zeroinitializer, <4 x i1> %1) #4
   %3 = bitcast <8 x bfloat> %2 to <2 x i64>
   ret <2 x i64> %3
@@ -420,7 +420,7 @@ define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(<2 x i64> %C, i8 zeroext %U,
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -443,7 +443,7 @@ define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(<2 x i64> %C, i8 zeroext %U,
 ;
 entry:
   %0 = bitcast i8 %U to <8 x i1>
-  %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %2 = bitcast <2 x i64> %C to <8 x bfloat>
   %3 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %2, <4 x i1> %1) #4
   %4 = bitcast <8 x bfloat> %3 to <2 x i64>
@@ -463,7 +463,7 @@ define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat>
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> undef, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP4]], <8 x i16> zeroinitializer, <8 x i16> [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16>
@@ -480,7 +480,7 @@ define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U
 entry:
   %0 = bitcast i8 %U to <8 x i1>
   %1 = bitcast <2 x i64> %C to <8 x bfloat>
-  %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
+  %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
   %3 = select <8 x i1> %0, <8 x bfloat> %2, <8 x bfloat> %1
   %4 = bitcast <8 x bfloat> %3 to <2 x i64>
   ret <2 x i64> %4
@@ -726,14 +726,14 @@ define <16 x i16> @test_no_vbroadcast1() sanitize_memory {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <8 x i16>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[TMP2]]
 ;
 entry:
   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   %1 = bitcast <8 x bfloat> %0 to <8 x i16>
-  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> zeroinitializer
+  %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <16 x i32> zeroinitializer
   ret <16 x i16> %2
 }
 
@@ -744,13 +744,13 @@ define <16 x bfloat> @test_no_vbroadcast2() nounwind sanitize_memory {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true))
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x bfloat> [[TMP0]], <8 x bfloat> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x bfloat> [[TMP0]], <8 x bfloat> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x bfloat> [[TMP1]]
 ;
 entry:
   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-  %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer
+  %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> poison, <16 x i32> zeroinitializer
   ret <16 x bfloat> %1
 }
 
@@ -760,13 +760,13 @@ define <16 x i32> @pr83358() sanitize_memory {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP3]]
 ;
   %1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>)
   %2 = bitcast <8 x bfloat> %1 to <4 x i32>
-  %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   ret <16 x i32> %3
 }
 ;.