[llvm] [msan] Handle x86_avx512_(min|max)_p[sd]_512 intrinsics (PR #124421)
Thurston Dang via llvm-commits
llvm-commits at lists.llvm.org
Sat Jan 25 09:45:01 PST 2025
https://github.com/thurstond updated https://github.com/llvm/llvm-project/pull/124421
From 8289e7fe1a90bb113eb911877809dcfb38a08398 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Sat, 25 Jan 2025 04:52:50 +0000
Subject: [PATCH 1/2] [msan] Handle x86_avx512_(min|max)_p[sd]_512 intrinsics
The AVX/SSE variants are already handled heuristically
(handleUnknownIntrinsic falls through to maybeHandleSimpleNomemIntrinsic),
but the AVX512 variants take an additional parameter (the rounding mode)
whose type differs from the return type, so the heuristic fails to match
them. We generalize maybeHandleSimpleNomemIntrinsic to allow a specified
number of trailing flags (ignored by MSan) and call it explicitly to handle
the AVX512 min/max ps/pd intrinsics.
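As an illustration of why the heuristic match fails (declarations sketched
from the x86 intrinsic definitions; the trailing i32 is the rounding mode):

  ; SSE variant: every operand type equals the return type, so
  ; maybeHandleSimpleNomemIntrinsic matches it heuristically.
  declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)

  ; AVX512 variant: the trailing i32 rounding-mode operand differs from the
  ; return type, so the heuristic rejects it unless told to ignore one
  ; trailing flag.
  declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)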
---
.../Instrumentation/MemorySanitizer.cpp | 31 +++++++++++++++----
1 file changed, 25 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 56d3eb10d73e95..922b9094ef47f1 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2989,17 +2989,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// Handle (SIMD arithmetic)-like intrinsics.
///
- /// Instrument intrinsics with any number of arguments of the same type,
- /// equal to the return type. The type should be simple (no aggregates or
- /// pointers; vectors are fine).
+ /// Instrument intrinsics with any number of arguments of the same type [*],
+ /// equal to the return type, plus a specified number of trailing flags of
+ /// any type.
+ ///
+ /// [*] The type should be simple (no aggregates or pointers; vectors are
+ /// fine).
+ ///
/// Caller guarantees that this intrinsic does not access memory.
- bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) {
+ bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I,
+                                      unsigned int trailingFlags) {
Type *RetTy = I.getType();
if (!(RetTy->isIntOrIntVectorTy() || RetTy->isFPOrFPVectorTy()))
return false;
unsigned NumArgOperands = I.arg_size();
- for (unsigned i = 0; i < NumArgOperands; ++i) {
+ assert(NumArgOperands >= trailingFlags);
+ for (unsigned i = 0; i < NumArgOperands - trailingFlags; ++i) {
Type *Ty = I.getArgOperand(i)->getType();
if (Ty != RetTy)
return false;
@@ -3043,7 +3048,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
if (I.doesNotAccessMemory())
- if (maybeHandleSimpleNomemIntrinsic(I))
+ if (maybeHandleSimpleNomemIntrinsic(I, /* trailingFlags */ 0))
return true;
// FIXME: detect and handle SSE maskstore/maskload
@@ -4466,6 +4471,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
}
+ // Packed
+ case Intrinsic::x86_avx512_min_ps_512:
+ case Intrinsic::x86_avx512_min_pd_512:
+ case Intrinsic::x86_avx512_max_ps_512:
+ case Intrinsic::x86_avx512_max_pd_512: {
+ // These AVX512 variants contain the rounding mode as a trailing flag.
+ // Earlier variants do not have a trailing flag and are already handled
+ // by maybeHandleSimpleNomemIntrinsic(I, 0) via handleUnknownIntrinsic.
+ bool success = maybeHandleSimpleNomemIntrinsic(I, /* trailingFlags */ 1);
+ (void)success;
+ assert(success);
+ break;
+ }
+
case Intrinsic::fshl:
case Intrinsic::fshr:
handleFunnelShift(I);
From 8fd836fb72ae225be772d8b692ac2c6aaa8d5f21 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Sat, 25 Jan 2025 17:31:46 +0000
Subject: [PATCH 2/2] Test (not the correct base)
---
.../MemorySanitizer/avx512-intrinsics.ll | 13602 ++++++++++++++++
1 file changed, 13602 insertions(+)
create mode 100644 llvm/test/Instrumentation/MemorySanitizer/avx512-intrinsics.ll
diff --git a/llvm/test/Instrumentation/MemorySanitizer/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/avx512-intrinsics.ll
new file mode 100644
index 00000000000000..c24f9cf377cf62
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/avx512-intrinsics.ll
@@ -0,0 +1,13602 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt %s -S -mtriple=x86_64-linux-gnu -mattr=+avx512f -passes=msan 2>&1 | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/X86/avx512-intrinsics.ll
+
+define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_compress_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10:[0-9]+]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]])
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP11]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1)
+ ret <8 x double> %2
+}
+
+define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_compress_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]])
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP9]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1)
+ ret <8 x double> %2
+}
+
+define <8 x double> @test_compress_pd_512(<8 x double> %data) #0 {
+; CHECK-LABEL: @test_compress_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true))
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP2]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret <8 x double> %1
+}
+
+define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_mask_compress_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]])
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP11]]
+;
+ %1 = bitcast i16 %mask to <16 x i1>
+ %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1)
+ ret <16 x float> %2
+}
+
+define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) #0 {
+; CHECK-LABEL: @test_maskz_compress_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]])
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP9]]
+;
+ %1 = bitcast i16 %mask to <16 x i1>
+ %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1)
+ ret <16 x float> %2
+}
+
+define <16 x float> @test_compress_ps_512(<16 x float> %data) #0 {
+; CHECK-LABEL: @test_compress_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true))
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP2]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret <16 x float> %1
+}
+
+define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_compress_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]])
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1)
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_compress_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]])
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP9]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1)
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @test_compress_q_512(<8 x i64> %data) #0 {
+; CHECK-LABEL: @test_compress_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true))
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP2]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret <8 x i64> %1
+}
+
+define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_mask_compress_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]])
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP11]]
+;
+ %1 = bitcast i16 %mask to <16 x i1>
+ %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1)
+ ret <16 x i32> %2
+}
+
+define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 {
+; CHECK-LABEL: @test_maskz_compress_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]])
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP9]]
+;
+ %1 = bitcast i16 %mask to <16 x i1>
+ %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1)
+ ret <16 x i32> %2
+}
+
+define <16 x i32> @test_compress_d_512(<16 x i32> %data) #0 {
+; CHECK-LABEL: @test_compress_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true))
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP2]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret <16 x i32> %1
+}
+
+define <8 x double> @test_expand_pd_512(<8 x double> %data) #0 {
+; CHECK-LABEL: @test_expand_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true))
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP2]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret <8 x double> %1
+}
+
+define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_expand_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]])
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP11]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1)
+ ret <8 x double> %2
+}
+
+define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_expand_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]])
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP9]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1)
+ ret <8 x double> %2
+}
+
+define <16 x float> @test_expand_ps_512(<16 x float> %data) #0 {
+; CHECK-LABEL: @test_expand_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true))
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP2]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_mask_expand_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]])
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP11]]
+;
+ %1 = bitcast i16 %mask to <16 x i1>
+ %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1)
+ ret <16 x float> %2
+}
+
+define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) #0 {
+; CHECK-LABEL: @test_maskz_expand_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]])
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP9]]
+;
+ %1 = bitcast i16 %mask to <16 x i1>
+ %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1)
+ ret <16 x float> %2
+}
+
+define <8 x i64> @test_expand_q_512(<8 x i64> %data) #0 {
+; CHECK-LABEL: @test_expand_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true))
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP2]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret <8 x i64> %1
+}
+
+define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_expand_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]])
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1)
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_expand_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]])
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP9]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1)
+ ret <8 x i64> %2
+}
+
+define <16 x i32> @test_expand_d_512(<16 x i32> %data) #0 {
+; CHECK-LABEL: @test_expand_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true))
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP2]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret <16 x i32> %1
+}
+
+define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_mask_expand_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]])
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP11]]
+;
+ %1 = bitcast i16 %mask to <16 x i1>
+ %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1)
+ ret <16 x i32> %2
+}
+
+define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) #0 {
+; CHECK-LABEL: @test_maskz_expand_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]])
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP9]]
+;
+ %1 = bitcast i16 %mask to <16 x i1>
+ %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1)
+ ret <16 x i32> %2
+}
+
+define <16 x float> @test_rcp_ps_512(<16 x float> %a0) #0 {
+; CHECK-LABEL: @test_rcp_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
+
+define <8 x double> @test_rcp_pd_512(<8 x double> %a0) #0 {
+; CHECK-LABEL: @test_rcp_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1)
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1]
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
+
+declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32)
+
+define <2 x double> @test_rndscale_sd(<2 x double> %a, <2 x double> %b) #0 {
+; CHECK-LABEL: @test_rndscale_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 11, i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 11, i32 4)
+ ret <2 x double>%res
+}
+
+define <2 x double> @test_rndscale_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) #0 {
+; CHECK-LABEL: @test_rndscale_sd_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4)
+ ret <2 x double>%res
+}
+
+define <2 x double> @test_rndscale_sd_mask_load(<2 x double> %a, ptr %bptr, <2 x double> %c, i8 %mask) #0 {
+; CHECK-LABEL: @test_rndscale_sd_mask_load(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[B:%.*]] = load <2 x double>, ptr [[BPTR:%.*]], align 16
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[BPTR]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP9]], align 16
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %b = load <2 x double>, ptr %bptr
+ %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4)
+ ret <2 x double>%res
+}
+
+define <2 x double> @test_rndscale_sd_maskz(<2 x double> %a, <2 x double> %b, i8 %mask) #0 {
+; CHECK-LABEL: @test_rndscale_sd_maskz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 11, i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> zeroinitializer, i8 %mask, i32 11, i32 4)
+ ret <2 x double>%res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32)
+
+define <4 x float> @test_rndscale_ss(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: @test_rndscale_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 11, i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
+ ret <4 x float>%res
+}
+
+define <4 x float> @test_rndscale_ss_load(<4 x float> %a, ptr %bptr) #0 {
+; CHECK-LABEL: @test_rndscale_ss_load(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load <4 x float>, ptr [[BPTR:%.*]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[BPTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B]], <4 x float> undef, i8 -1, i32 11, i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %b = load <4 x float>, ptr %bptr
+ %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
+ ret <4 x float>%res
+}
+
+define <4 x float> @test_rndscale_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) #0 {
+; CHECK-LABEL: @test_rndscale_ss_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 11, i32 4)
+ ret <4 x float>%res
+}
+
+define <4 x float> @test_rndscale_ss_maskz(<4 x float> %a, <4 x float> %b, i8 %mask) #0 {
+; CHECK-LABEL: @test_rndscale_ss_maskz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 11, i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask, i32 11, i32 4)
+ ret <4 x float>%res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
+
+define <8 x double> @test7(<8 x double> %a) #0 {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> [[A:%.*]], i32 11, <8 x double> [[A]], i8 -1, i32 4)
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4)
+ ret <8 x double>%res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
+
+define <16 x float> @test8(<16 x float> %a) #0 {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> [[A:%.*]], i32 11, <16 x float> [[A]], i16 -1, i32 4)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4)
+ ret <16 x float>%res
+}
+
+define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) #0 {
+; CHECK-LABEL: @test_rsqrt_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
+
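+; llvm.sqrt.v8f64 is a handled intrinsic: the operand shadow is forwarded
+; straight to the return shadow; no warning branch is emitted.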
+define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) #0 {
+; CHECK-LABEL: @test_sqrt_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]])
+; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP2]]
+;
+ %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
+ ret <8 x double> %1
+}
+
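+; For the masked forms, the result shadow is routed through a select mirroring
+; the mask ([[_MSPROP_SELECT]]), so an uninitialized mask taints the selected lanes.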
+define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_sqrt_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP13]]
+;
+ %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
+ ret <8 x double> %3
+}
+
+define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_sqrt_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP1]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP11]]
+;
+ %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
+ ret <8 x double> %3
+}
+declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
+
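+; The rounding variant goes through the x86-specific sqrt intrinsic: the vector
+; shadow is checked strictly and the result shadow is clean (the constant i32
+; rounding operand has no shadow to check).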
+define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) #0 {
+; CHECK-LABEL: @test_sqrt_round_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11)
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP5]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
+ ret <8 x double> %1
+}
+
+define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_sqrt_round_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> zeroinitializer, <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP9]], <8 x double> [[TMP7]], <8 x double> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP16]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
+ ret <8 x double> %3
+}
+
+define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_sqrt_round_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11)
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP8]], <8 x double> [[TMP6]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP14]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
+ ret <8 x double> %3
+}
+declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32) nounwind readnone
+
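+; <16 x float> counterparts of the sqrt tests above; the instrumentation
+; pattern is identical.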
+define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) #0 {
+; CHECK-LABEL: @test_sqrt_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]])
+; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP2]]
+;
+ %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_mask_sqrt_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP13]]
+;
+ %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) #0 {
+; CHECK-LABEL: @test_maskz_sqrt_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP11]]
+;
+ %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
+
+define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) #0 {
+; CHECK-LABEL: @test_sqrt_round_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP5]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_mask_sqrt_round_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) #0 {
+; CHECK-LABEL: @test_maskz_sqrt_round_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11)
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP6]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP14]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32) nounwind readnone
+
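+; getexp is a masked intrinsic but is called here with an all-ones mask; it is
+; instrumented with the strict check-all-operands pattern and a clean result shadow.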
+define <8 x double> @test_getexp_pd_512(<8 x double> %a0) #0 {
+; CHECK-LABEL: @test_getexp_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1, i32 4)
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4)
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) #0 {
+; CHECK-LABEL: @test_getexp_round_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1, i32 12)
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 12)
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <16 x float> @test_getexp_ps_512(<16 x float> %a0) #0 {
+; CHECK-LABEL: @test_getexp_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 4)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) #0 {
+; CHECK-LABEL: @test_getexp_round_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 8)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
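+; Scalar masked sqrt: each call gets its own strict check. The later calls
+; check fewer shadows because zeroinitializer and the constant -1 mask carry
+; no shadow.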
+define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_sqrt_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES0:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]], i8 [[MASK]], i32 9)
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSCMP13]], [[_MSCMP14]]
+; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> zeroinitializer, i8 [[MASK]], i32 10)
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]]
+; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
+; CHECK: 21:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 22:
+; CHECK-NEXT: [[RES3:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> zeroinitializer, i8 -1, i32 11)
+; CHECK-NEXT: [[RES_1:%.*]] = fadd <4 x float> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES_2:%.*]] = fadd <4 x float> [[RES2]], [[RES3]]
+; CHECK-NEXT: [[RES:%.*]] = fadd <4 x float> [[RES_1]], [[RES_2]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 10)
+ %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 11)
+
+ %res.1 = fadd <4 x float> %res0, %res1
+ %res.2 = fadd <4 x float> %res2, %res3
+ %res = fadd <4 x float> %res.1, %res.2
+ ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
+
+define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_sqrt_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES0:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]], i8 [[MASK]], i32 9)
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSCMP13]], [[_MSCMP14]]
+; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> zeroinitializer, i8 [[MASK]], i32 10)
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]]
+; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
+; CHECK: 21:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 22:
+; CHECK-NEXT: [[RES3:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> zeroinitializer, i8 -1, i32 11)
+; CHECK-NEXT: [[RES_1:%.*]] = fadd <2 x double> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES_2:%.*]] = fadd <2 x double> [[RES2]], [[RES3]]
+; CHECK-NEXT: [[RES:%.*]] = fadd <2 x double> [[RES_1]], [[RES_2]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 10)
+ %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 11)
+
+ %res.1 = fadd <2 x double> %res0, %res1
+ %res.2 = fadd <2 x double> %res2, %res3
+ %res = fadd <2 x double> %res.1, %res.2
+ ret <2 x double> %res
+}
+
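+; Scalar conversions: the unsigned variants check only element 0 of the operand
+; shadow (extractelement), while the signed variants check the full 128-bit
+; shadow (bitcast to i128).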
+define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_cvttsd2usi(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[A0:%.*]], i32 4)
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[A0]], i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]]
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES2]]
+;
+ %res0 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8)
+ %res2 = add i32 %res0, %res1
+ ret i32 %res2
+}
+declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_cvttsd2si(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[A0:%.*]], i32 4)
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[A0]], i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]]
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES2]]
+;
+ %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8)
+ %res2 = add i32 %res0, %res1
+ ret i32 %res2
+}
+declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_cvttss2si(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A0:%.*]], i32 8)
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A0]], i32 4)
+; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]]
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES2]]
+;
+ %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8)
+ %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4)
+ %res2 = add i32 %res0, %res1
+ ret i32 %res2
+}
+declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvttss2si_load(ptr %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_cvttss2si_load(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK: 2:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 3:
+; CHECK-NEXT: [[A1:%.*]] = load <4 x float>, ptr [[A0:%.*]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A1]], i32 4)
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %a1 = load <4 x float>, ptr %a0
+ %res = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a1, i32 4)
+ ret i32 %res
+}
+
+define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_cvttss2usi(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[A0:%.*]], i32 8)
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[A0]], i32 4)
+; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]]
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES2]]
+;
+ %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8)
+ %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4)
+ %res2 = add i32 %res0, %res1
+ ret i32 %res2
+}
+declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_cvtsd2usi32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0:%.*]], i32 4)
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0]], i32 11)
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0]], i32 9)
+; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]]
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES4]]
+;
+ %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 11)
+ %res2 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 9)
+ %res3 = add i32 %res, %res1
+ %res4 = add i32 %res3, %res2
+ ret i32 %res4
+}
+declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_cvtsd2si32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0:%.*]], i32 4)
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0]], i32 11)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0]], i32 9)
+; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]]
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES4]]
+;
+ %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 11)
+ %res2 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 9)
+ %res3 = add i32 %res, %res1
+ %res4 = add i32 %res3, %res2
+ ret i32 %res4
+}
+declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_cvtss2usi32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0:%.*]], i32 4)
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0]], i32 11)
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i32 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0]], i32 9)
+; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]]
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES4]]
+;
+ %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 11)
+ %res2 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 9)
+ %res3 = add i32 %res, %res1
+ %res4 = add i32 %res3, %res2
+ ret i32 %res4
+}
+declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_cvtss2si32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0:%.*]], i32 4)
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0]], i32 11)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0]], i32 9)
+; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]]
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES4]]
+;
+ %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 11)
+ %res2 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 9)
+ %res3 = add i32 %res, %res1
+ %res4 = add i32 %res3, %res2
+ ret i32 %res4
+}
+declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone
+
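+; vcvtps2ph (despite the _256 test name, this exercises the 512-bit intrinsic):
+; the result's shadow is cleared at the destination's shadow address, and the
+; pointer operand is checked before the store.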
+define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, ptr %dst) #0 {
+; CHECK-LABEL: @test_x86_vcvtps2ph_256(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0:%.*]], i32 2, <16 x i16> zeroinitializer, i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0]], i32 11, <16 x i16> zeroinitializer, i16 [[MASK:%.*]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i16> [[TMP3]] to i256
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i16 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES3:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0]], i32 12, <16 x i16> [[SRC:%.*]], i16 [[MASK]])
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP8]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK: 15:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 16:
+; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], 87960930222080
+; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr [[TMP19]], align 32
+; CHECK-NEXT: store <16 x i16> [[RES1]], ptr [[DST]], align 32
+; CHECK-NEXT: [[RES:%.*]] = add <16 x i16> [[RES2]], [[RES3]]
+; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i16> [[RES]]
+;
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 11, <16 x i16> zeroinitializer, i16 %mask)
+ %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 12, <16 x i16> %src, i16 %mask)
+ store <16 x i16> %res1, ptr %dst
+ %res = add <16 x i16> %res2, %res3
+ ret <16 x i16> %res
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly
+
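+; Mask-producing comparisons return an i16/i8 with a clean shadow after a
+; strict check of both vector operands.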
+define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) #0 {
+; CHECK-LABEL: @test_cmpps(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 2, <16 x i1> splat (i1 true), i32 8)
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[RES]] to i16
+; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i16 [[TMP7]]
+;
+ %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+ %1 = bitcast <16 x i1> %res to i16
+ ret i16 %1
+}
+declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32)
+
+define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) #0 {
+; CHECK-LABEL: @test_cmppd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 4, <8 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i1> [[RES]] to i8
+; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i8 [[TMP7]]
+;
+ %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
+ %1 = bitcast <8 x i1> %res to i8
+ ret i8 %1
+}
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32)
+
+
+; fp min/max
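+; min/max.pd.512 carry a trailing rounding operand: the result shadow is the OR
+; of the two vector operand shadows, and the constant rounding flag contributes
+; only zeroinitializer.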
+define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) #0 {
+; CHECK-LABEL: @test_vmaxpd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 4)
+; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP7]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
+ ret <8 x double> %1
+}
+declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)
+
+define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) #0 {
+; CHECK-LABEL: @test_vminpd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 4)
+; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP7]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
+ ret <8 x double> %1
+}
+declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)
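+
+; Note: for the AVX512 min/max tests above, MSan propagates shadow instead of
+; aborting: the [[_MSPROP]] lines OR the two operand shadows into the
+; return-value shadow, and the constant rounding operand contributes nothing
+; (the "or ... zeroinitializer" is a no-op left by the instrumentation).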
+
+define void @test_mask_store_ss(ptr %ptr, <4 x float> %data, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_store_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP1]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP1]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[MASK]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP8]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP9]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[TMP14]], i32 1, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
+; CHECK: 16:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 17:
+; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[DATA:%.*]], ptr [[PTR]], i32 1, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT: ret void
+;
+ %1 = and i8 %mask, 1
+ %2 = bitcast i8 %1 to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %data, ptr %ptr, i32 1, <4 x i1> %extract)
+ ret void
+}
+declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) #1
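+
+; Note: in test_mask_store_ss above, the shadow of %data is written with a
+; matching llvm.masked.store to shadow memory; the CHECK lines compute the
+; shadow address by XOR-ing the application pointer with 87960930222080
+; (0x500000000000, the default x86-64 shadow mapping offset).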
+
+
+declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32)
+declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32)
+
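+; Note: sub.ps.512/mul.ps.512 below are instrumented strictly: any set bit in
+; either vector operand's shadow (the i512 compares) reaches
+; __msan_warning_noreturn, and a fully clean shadow is stored for the result.
+; The trailing i32 (8/9/10/11 = round-to-nearest/down/up/toward-zero with
+; exceptions suppressed, 4 = current direction) is a constant immediate and
+; carries no shadow.
+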
+define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 {
+; CHECK-LABEL: @test_vsubps_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) #0 {
+; CHECK-LABEL: @test_vsubps_rd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 {
+; CHECK-LABEL: @test_vsubps_ru(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 {
+; CHECK-LABEL: @test_vsubps_rz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 {
+; CHECK-LABEL: @test_vmulps_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 {
+; CHECK-LABEL: @test_vmulps_rd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 {
+; CHECK-LABEL: @test_vmulps_ru(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) #0 {
+; CHECK-LABEL: @test_vmulps_rz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
+ ret <16 x float> %1
+}
+
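+; Note: for the masked variants below, MSan also instruments the select with
+; the mask's shadow: lanes whose mask bit is well-defined take the shadow of
+; the chosen operand ([[TMP11]]), while lanes with an uninitialized mask bit
+; get the XOR of the two candidate values OR-ed with their shadows
+; ([[TMP15]]), poisoning every bit the choice could change.
+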
+define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_vmulps_mask_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_vmulps_mask_rd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_vmulps_mask_ru(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_vmulps_mask_rz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
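+; Note: the passthru variants below thread the shadow of %passthru
+; ([[TMP4]]) through the same select logic: lanes that keep the passthru
+; inherit its shadow, and lanes with an uninitialized mask bit OR it in.
+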
+define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_vmulps_mask_passthru_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_vmulps_mask_passthru_rd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_vmulps_mask_passthru_ru(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_vmulps_mask_passthru_rz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+ ret <16 x float> %3
+}
+
+define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_vmulpd_mask_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP16]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 8)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
+ ret <8 x double> %3
+}
+
+define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_vmulpd_mask_rd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP16]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 9)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
+ ret <8 x double> %3
+}
+
+define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_vmulpd_mask_ru(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP16]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 10)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
+ ret <8 x double> %3
+}
+
+define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_vmulpd_mask_rz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP16]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 11)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
+ ret <8 x double> %3
+}
+
+define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
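+; The unmasked @test_mm512_add_round_ps_* variants take the strict path
+; instead: the full 512-bit shadow of each vector operand is collapsed to an
+; i512 compare, __msan_warning_noreturn is reached if any bit is set, and a
+; zero shadow is stored for the return value.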
+define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ ret <16 x float> %1
+}
+
+declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)
+
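+; The @llvm.x86.avx512.sub.ps.512 tests mirror the add.ps.512 ones above:
+; the same masked and unmasked variants over rounding immediates 8-11 and 4,
+; with identical expected instrumentation.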
+define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ ret <16 x float> %1
+}
+
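+; In the @test_mm512_maskz_div_round_ps_* variants the masked-off lanes come
+; from zeroinitializer, a fully initialized constant, so the fall-through
+; shadow operands in the checked pattern are all zeroinitializer rather than
+; a loaded %src shadow.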
+define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_div_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_div_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_div_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_div_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_div_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ ret <16 x float> %1
+}
+declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32)
+
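+; Note the contrast with the min/max tests that follow. div.ps.512 is not
+; explicitly handled, so the checks above show the strict fallback: each
+; operand shadow is checked (any set bit branches to
+; __msan_warning_noreturn) and the return shadow stored to
+; __msan_retval_tls is always zeroinitializer. min.ps.512 and max.ps.512
+; instead get precise shadow propagation: the operand shadows are OR'ed
+; together ([[_MSPROP]]) and that result, not zero, becomes the return
+; shadow.
+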
+define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_min_round_ps_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_min_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_min_round_ps_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_min_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_min_round_ps_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_min_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ ret <16 x float> %1
+}
+declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)
+
+define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_max_round_ps_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_maskz_max_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_max_round_ps_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_mask_max_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_max_round_ps_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_max_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
+ ret <16 x float> %1
+}
+declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)
+
+declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
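+; The scalar mask.add.ss.round tests below show the same strict fallback as
+; the div.ps.512 tests above: every operand shadow, including the i8 mask,
+; is checked, and the return shadow stored to __msan_retval_tls is
+; zeroinitializer.
+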
+define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_ss_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_ss_rd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 9)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_ss_ru(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 10)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 10)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_ss_rz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 11)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 11)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_ss_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_add_ss_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) #0 {
+; CHECK-LABEL: @test_add_ss_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_ss_current_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %a1.val = load float, ptr %a1
+ %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+ %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+ %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+ %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, ptr %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_add_ss_current_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %a1.val = load float, ptr %a1
+ %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+ %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+ %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+ %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
+
+define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_rd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 9)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_ru(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 10)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 10)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_rz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 11)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 11)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_add_sd_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_add_sd_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
+ ret <2 x double> %res
+}
+
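+; The *_memfold tests fold the second operand in from memory: MSan first
+; checks the shadow of the pointer argument itself, then loads the scalar's
+; shadow through the shadow mapping (xor with 0x500000000000) and propagates
+; it through the insertelement chain before the usual operand checks.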
+define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_current_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %a1.val = load double, ptr %a1
+ %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+ %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, ptr %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_add_sd_current_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %a1.val = load double, ptr %a1
+ %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+ %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
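+; Like the mask.add.*.round tests above, the max/min scalar-round intrinsics
+; are instrumented conservatively: every operand shadow is OR-folded into a
+; single check that branches to __msan_warning_noreturn, and the return
+; shadow is stored as zeroinitializer (the result is assumed initialized).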
+declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
+define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_ss_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_ss_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) #0 {
+; CHECK-LABEL: @test_max_ss_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) #0 {
+; CHECK-LABEL: @test_max_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_ss_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %a1.val = load float, ptr %a1
+ %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+ %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+ %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+ %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, ptr %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_ss_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %a1.val = load float, ptr %a1
+ %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+ %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+ %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+ %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
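+
+; The mask.max.sd.round tests below mirror the ss variants with <2 x double>
+; operands; the instrumentation pattern is identical.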
+declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
+
+define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_sd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_sd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_max_sd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_max_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_sd_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %a1.val = load double, ptr %a1
+ %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+ %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, ptr %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_sd_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %a1.val = load double, ptr %a1
+ %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+ %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
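+; Conversion tests: cvtsi2ss32 is checked conservatively (clean return
+; shadow), while cvtusi2ss propagates shadow precisely -- only the converted
+; scalar operand is checked eagerly, and the result shadow reuses the
+; pass-through vector's shadow with element 0 cleared.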
+define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) #0 {
+; CHECK-LABEL: @test_x86_avx512_cvtsi2ss32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 11)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 11)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind readnone
+
+define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss(<4 x float> %a, i32 %b) #0 {
+; CHECK-LABEL: @test_x86_avx512__mm_cvt_roundu32_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 9)
+; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, ptr %ptr) #0 {
+; CHECK-LABEL: @test_x86_avx512__mm_cvt_roundu32_ss_mem(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSLD]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B]], i32 9)
+; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %b = load i32, ptr %ptr
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b) #0 {
+; CHECK-LABEL: @test_x86_avx512__mm_cvtu32_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, ptr %ptr) #0 {
+; CHECK-LABEL: @test_x86_avx512__mm_cvtu32_ss_mem(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSLD]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B]], i32 4)
+; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %b = load i32, ptr %ptr
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind readnone
+
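+; vpermi2var tests: the d.512 variant has identical operand and return types,
+; so its shadow is propagated by OR-ing the operand shadows; the pd/ps
+; variants mix integer index and FP data vectors, so they fall back to the
+; conservative check-all-operands pattern with a clean return shadow.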
+declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32> @test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2P]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP9]]
+;
+ %x2 = load <16 x i32>, ptr %x2p
+ %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %1
+}
+
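+; For the masked variant, the trailing select on %x3 gets the standard select
+; shadow instrumentation: shadows are blended under the concrete mask, and
+; lanes with a poisoned mask bit stay poisoned unless both candidate values
+; are equal and clean (the xor/or sequence below).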
+define <16 x i32> @test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X1]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %x2 = load <16 x i32>, ptr %x2p
+ %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %2 = bitcast i16 %x3 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
+ ret <16 x i32> %3
+}
+
+declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)
+
+define <8 x double> @test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP9]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
+ ret <8 x double> %1
+}
+
+define <8 x double> @test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
+; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]]
+; CHECK-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP20]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
+ %2 = bitcast <8 x i64> %x1 to <8 x double>
+ %3 = bitcast i8 %x3 to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x double> %1, <8 x double> %2
+ ret <8 x double> %4
+}
+
+declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)
+
+define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP9]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
+ ret <16 x float> %1
+}
+
+define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
+; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]]
+; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP20]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
+ %2 = bitcast <16 x i32> %x1 to <16 x float>
+ %3 = bitcast i16 %x3 to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
+ ret <16 x float> %4
+}
+
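+; The integer vpermi2var tests use pure bitwise shadow propagation: the three
+; operand shadows are or'ed together (_MSPROP/_MSPROP1) and stored as the
+; result shadow, with no operand checks or warning branches.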
+declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
+
+define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP4]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ ret <8 x i64> %1
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
+ ret <8 x i64> %3
+}
+
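+; In the maskz tests both the value select and the shadow select use
+; zeroinitializer on the false side, so lanes disabled by %x3 are fully
+; initialized.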
+define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %x2 = load <16 x i32>, ptr %x2p
+ %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
+ %2 = bitcast i16 %x3 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+ ret <16 x i32> %3
+}
+
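+; This test loads one scalar and broadcasts it. Its shadow follows the same
+; shape: the loaded shadow (_MSLD) is inserted into splat (i64 -1) -- the
+; shadow of undef -- and broadcast with the same insertelement/shufflevector
+; pair as the value.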
+define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, ptr %x2ptr, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[X2S:%.*]] = load double, ptr [[X2PTR:%.*]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2PTR]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[X2INS:%.*]] = insertelement <8 x double> undef, double [[X2S]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
+; CHECK-NEXT: [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]])
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64>
+; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]]
+; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP23]]
+;
+ %x2s = load double, ptr %x2ptr
+ %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
+ %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
+ %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
+ ret <8 x double> %3
+}
+
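+; The operand shadows here were already consumed by the strict check, so every
+; term of the select-shadow arithmetic folds against zeroinitializer and the
+; poisoned-mask contribution reduces to the result bits themselves.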
+define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2)
+ %2 = bitcast i16 %x3 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ ret <8 x i64> %3
+}
+
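+; The vpermt2var tests call llvm.x86.avx512.vpermi2var.*.512 with %x0 and %x1
+; swapped, matching the operand order of the legacy vpermt2var intrinsics;
+; the shadow handling is identical to the vpermi2var tests above.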
+define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP4]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
+ ret <16 x i32> %1
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
+ %2 = bitcast i16 %x3 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
+ ret <16 x i32> %3
+}
+
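+; mask.scalef takes a mask and a rounding-mode immediate and is checked
+; strictly: the masked call folds the mask shadow into the check
+; (_MSCMP4/_MSOR5), while the constant i8 -1 mask and the rounding-mode
+; immediates (11, 8) of the second call contribute no checks.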
+declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x double> [[X2]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 11)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 8)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x float> [[X2:%.*]], i16 [[X3:%.*]], i32 10)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x float> [[X2]], i16 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 10)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 8)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
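+; The truncating pmov/pmovs/pmovus q->b/w tests are likewise checked strictly:
+; each call checks the shadows of its non-constant operands and stores a zero
+; return shadow. A constant mask (i8 -1) or passthrough (zeroinitializer)
+; drops the corresponding check.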
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qb_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
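+; The .mem variants additionally check the destination pointer's shadow (the
+; i64 at offset 0 of @__msan_param_tls); note that no shadow is written for
+; the memory the intrinsic stores to.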
+declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qb_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qb_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qb_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qb_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qb_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES4]]
+;
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES4]]
+;
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES4]]
+;
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
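+; The unmasked pmov.qd conversion below is written as a plain trunc (this test
+; looks auto-upgraded), which lets MSan propagate shadow precisely: the
+; <8 x i64> parameter shadow is itself truncated to <8 x i32> rather than
+; collapsed into an all-or-nothing check.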
+define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmov_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32>
+; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP2]]
+;
+ %1 = trunc <8 x i64> %x0 to <8 x i32>
+ ret <8 x i32> %1
+}
+
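+; Masked form: shadow propagation mirrors the select. A select over the
+; truncated shadow and the passthrough shadow handles lanes with a known mask
+; bit; the xor/or chain then poisons any lane whose mask bit is itself
+; uninitialized, unless both candidate values are equal and clean.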
+define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP4]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i32> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP10]], <8 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP4]], <8 x i32> [[X1]]
+; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP11]]
+;
+ %1 = trunc <8 x i64> %x0 to <8 x i32>
+ %2 = bitcast i8 %x2 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1
+ ret <8 x i32> %3
+}
+
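+; Maskz form: same pattern as above, with zeroinitializer standing in for the
+; passthrough value and its (clean) shadow.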
+define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmov_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP3]], <8 x i32> zeroinitializer
+; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP10]]
+;
+ %1 = trunc <8 x i64> %x0 to <8 x i32>
+ %2 = bitcast i8 %x2 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
+ ret <8 x i32> %3
+}
+
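+; The .mem. variants store through %ptr and return void, so there is no result
+; shadow to propagate; instrumentation just checks the pointer, vector, and
+; mask shadows before each call (the constant -1 mask needs no check).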
+declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovs_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
+ ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_maskz_pmovs_qd_512(<8 x i64> %x0, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovs_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> zeroinitializer, i8 [[X2:%.*]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
+ ret <8 x i32> %res
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qd_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovus_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
+ ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_maskz_pmovus_qd_512(<8 x i64> %x0, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovus_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> zeroinitializer, i8 [[X2:%.*]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
+ ret <8 x i32> %res
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qd_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
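+; The d->b and d->w groups below repeat the qw/qd pattern at narrower source
+; elements; only the widths change (<16 x i8> shadow collapses to i128,
+; <16 x i16> to i256) and the mask shadow check widens from i8 to i16.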
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_db_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmov_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_db_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_db_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovs_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_db_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_db_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovus_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_db_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_dw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i16> [[RES4]]
+;
+ %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i16> %res0, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmov_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_dw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_dw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i16> [[RES4]]
+;
+ %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i16> %res0, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_dw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_dw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i16> [[RES4]]
+;
+ %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i16> %res0, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_dw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
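+; Conversions: the plain sitofp is propagated like the trunc above (select plus
+; poisoned-mask logic), while the sitofp.round intrinsic is strict-checked on
+; its vector operand and its result shadow is treated as clean, so the final
+; fadd inherits shadow only from the select.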
+declare <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32>, i32)
+
+define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[CVT:%.*]] = sitofp <16 x i32> [[X0:%.*]] to <16 x float>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK: 14:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 15:
+; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer
+; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %cvt = sitofp <16 x i32> %x0 to <16 x float>
+ %1 = bitcast i16 %x2 to <16 x i1>
+ %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1
+ %3 = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8)
+ %res2 = fadd <16 x float> %2, %3
+ ret <16 x float> %res2
+}
+
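+; Note: the masked cvt* tests below share one pattern: MSan checks the shadow
+; of every non-constant operand before each call and stores a zero shadow for
+; the result.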
+declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
+
+define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2dq_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]]
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES2]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
+
+define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> [[X0:%.*]], <8 x float> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> [[X0]], <8 x float> [[X1]], i8 -1, i32 10)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[RES2]]
+;
+ %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 %x2, i32 4)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 -1, i32 10)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
+
+define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2udq_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 10)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]]
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES2]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 10)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>, i16, i32)
+
+define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2dq_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 10)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]]
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> [[X0:%.*]], <8 x double> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> [[X0]], <8 x double> [[X1]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 %x2, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 -1, i32 8)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)
+
+define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2udq_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 10)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]]
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
+
+define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_pd2dq_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]]
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES2]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
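+; Note: the udq2ps test below mirrors the dq2ps test above, with uitofp in
+; place of sitofp.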
+declare <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32>, i32)
+
+define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[CVT:%.*]] = uitofp <16 x i32> [[X0:%.*]] to <16 x float>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK: 14:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 15:
+; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer
+; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %cvt = uitofp <16 x i32> %x0 to <16 x float>
+ %1 = bitcast i16 %x2 to <16 x i1>
+ %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1
+ %3 = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8)
+ %res2 = fadd <16 x float> %2, %3
+ ret <16 x float> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
+
+define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_pd2udq_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]]
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES2]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32)
+
+define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_ps2dq_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]]
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>, i16, i32)
+
+define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_ps2udq_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]]
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
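+; Note: the scalar getexp tests cover both the current-rounding (i32 4) and
+; SAE (i32 8) forms; each call is preceded by a shadow check of every
+; non-constant operand, and the result shadow is zero.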
+declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
+define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1) #0 {
+; CHECK-LABEL: @test_getexp_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_getexp_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES0:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]], i8 [[MASK]], i32 8)
+; CHECK-NEXT: [[RES_1:%.*]] = fadd <4 x float> [[RES0]], [[RES1]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES_1]]
+;
+ %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
+ %res.1 = fadd <4 x float> %res0, %res1
+ ret <4 x float> %res.1
+}
+
+define <4 x float> @test_maskz_getexp_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_getexp_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
+ ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
+
+define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_getexp_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_getexp_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES0:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]], i8 [[MASK]], i32 8)
+; CHECK-NEXT: [[RES_1:%.*]] = fadd <2 x double> [[RES0]], [[RES1]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES_1]]
+;
+ %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
+ %res.1 = fadd <2 x double> %res0, %res1
+ ret <2 x double> %res.1
+}
+
+define <2 x double> @test_maskz_getexp_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_getexp_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
+ ret <2 x double> %res
+}
+
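+; Note: for the scalar mask.cmp tests, a lone call yields a zero result
+; shadow; the *_all variants additionally propagate shadow bitwise through
+; the or/and combinations of the four results.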
+declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
+
+define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 5, i8 [[X3:%.*]], i32 8)
+; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i8 [[RES4]]
+;
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
+ ret i8 %res4
+}
+
+define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_sd_all(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES1:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 2, i8 -1, i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 3, i8 -1, i32 8)
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]]
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK: 14:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 15:
+; CHECK-NEXT: [[RES3:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 4, i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSCMP10]], [[_MSCMP11]]
+; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR14:%.*]] = or i1 [[_MSOR12]], [[_MSCMP13]]
+; CHECK-NEXT: br i1 [[_MSOR14]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]]
+; CHECK: 18:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 19:
+; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 5, i8 [[X3]], i32 8)
+; CHECK-NEXT: [[TMP20:%.*]] = xor i8 [[RES1]], -1
+; CHECK-NEXT: [[TMP21:%.*]] = xor i8 [[RES2]], -1
+; CHECK-NEXT: [[TMP22:%.*]] = and i8 [[TMP20]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = and i8 0, [[TMP21]]
+; CHECK-NEXT: [[TMP24:%.*]] = or i8 0, [[TMP22]]
+; CHECK-NEXT: [[TMP25:%.*]] = or i8 [[TMP24]], [[TMP23]]
+; CHECK-NEXT: [[RES11:%.*]] = or i8 [[RES1]], [[RES2]]
+; CHECK-NEXT: [[TMP26:%.*]] = xor i8 [[RES3]], -1
+; CHECK-NEXT: [[TMP27:%.*]] = xor i8 [[RES4]], -1
+; CHECK-NEXT: [[TMP28:%.*]] = and i8 [[TMP26]], 0
+; CHECK-NEXT: [[TMP29:%.*]] = and i8 0, [[TMP27]]
+; CHECK-NEXT: [[TMP30:%.*]] = or i8 0, [[TMP28]]
+; CHECK-NEXT: [[TMP31:%.*]] = or i8 [[TMP30]], [[TMP29]]
+; CHECK-NEXT: [[RES12:%.*]] = or i8 [[RES3]], [[RES4]]
+; CHECK-NEXT: [[TMP32:%.*]] = xor i8 [[RES11]], -1
+; CHECK-NEXT: [[TMP33:%.*]] = xor i8 [[RES12]], -1
+; CHECK-NEXT: [[TMP34:%.*]] = and i8 [[TMP25]], [[TMP31]]
+; CHECK-NEXT: [[TMP35:%.*]] = and i8 [[TMP32]], [[TMP31]]
+; CHECK-NEXT: [[TMP36:%.*]] = and i8 [[TMP25]], [[TMP33]]
+; CHECK-NEXT: [[TMP37:%.*]] = or i8 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP38:%.*]] = or i8 [[TMP37]], [[TMP36]]
+; CHECK-NEXT: [[RES13:%.*]] = or i8 [[RES11]], [[RES12]]
+; CHECK-NEXT: store i8 [[TMP38]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i8 [[RES13]]
+;
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 3, i8 -1, i32 8)
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 4, i8 %x3, i32 4)
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
+
+ %res11 = or i8 %res1, %res2
+ %res12 = or i8 %res3, %res4
+ %res13 = or i8 %res11, %res12
+ ret i8 %res13
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
+
+define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 3, i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i8 [[RES2]]
+;
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4)
+ ret i8 %res2
+}
+
+
+define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_ss_all(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES1:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 2, i8 -1, i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 3, i8 -1, i32 8)
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]]
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK: 14:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 15:
+; CHECK-NEXT: [[RES3:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 4, i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSCMP10]], [[_MSCMP11]]
+; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR14:%.*]] = or i1 [[_MSOR12]], [[_MSCMP13]]
+; CHECK-NEXT: br i1 [[_MSOR14]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]]
+; CHECK: 18:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 19:
+; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 5, i8 [[X3]], i32 8)
+; CHECK-NEXT: [[TMP20:%.*]] = and i8 [[RES1]], 0
+; CHECK-NEXT: [[TMP21:%.*]] = and i8 0, [[RES2]]
+; CHECK-NEXT: [[TMP22:%.*]] = or i8 0, [[TMP20]]
+; CHECK-NEXT: [[TMP23:%.*]] = or i8 [[TMP22]], [[TMP21]]
+; CHECK-NEXT: [[RES11:%.*]] = and i8 [[RES1]], [[RES2]]
+; CHECK-NEXT: [[TMP24:%.*]] = and i8 [[RES3]], 0
+; CHECK-NEXT: [[TMP25:%.*]] = and i8 0, [[RES4]]
+; CHECK-NEXT: [[TMP26:%.*]] = or i8 0, [[TMP24]]
+; CHECK-NEXT: [[TMP27:%.*]] = or i8 [[TMP26]], [[TMP25]]
+; CHECK-NEXT: [[RES12:%.*]] = and i8 [[RES3]], [[RES4]]
+; CHECK-NEXT: [[TMP28:%.*]] = and i8 [[TMP23]], [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = and i8 [[RES11]], [[TMP27]]
+; CHECK-NEXT: [[TMP30:%.*]] = and i8 [[TMP23]], [[RES12]]
+; CHECK-NEXT: [[TMP31:%.*]] = or i8 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP32:%.*]] = or i8 [[TMP31]], [[TMP30]]
+; CHECK-NEXT: [[RES13:%.*]] = and i8 [[RES11]], [[RES12]]
+; CHECK-NEXT: store i8 [[TMP32]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i8 [[RES13]]
+;
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 4, i8 %x3, i32 4)
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 5, i8 %x3, i32 8)
+
+ %res11 = and i8 %res1, %res2
+ %res12 = and i8 %res3, %res4
+ %res13 = and i8 %res11, %res12
+ ret i8 %res13
+}
+
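+; Note: the getmant interval/sign-control operand is the constant i32 11, so
+; no shadow check is emitted for it; only %x0, %x2 and the mask are checked.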
+declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> [[X0:%.*]], i32 11, <8 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> [[X0]], i32 11, <8 x double> [[X2]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 -1, i32 8)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> [[X0:%.*]], i32 11, <16 x float> [[X2:%.*]], i16 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> [[X0]], i32 11, <16 x float> [[X2]], i16 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 8)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 11, <2 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 12, <2 x double> zeroinitializer, i8 [[X3]], i32 4)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 13, <2 x double> [[X2]], i8 [[X3]], i32 8)
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP21:%.*]] = icmp ne i128 [[TMP21]], 0
+; CHECK-NEXT: [[_MSOR22:%.*]] = or i1 [[_MSOR20]], [[_MSCMP21]]
+; CHECK-NEXT: br i1 [[_MSOR22]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]]
+; CHECK: 22:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 23:
+; CHECK-NEXT: [[RES3:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 14, <2 x double> [[X2]], i8 -1, i32 4)
+; CHECK-NEXT: [[RES11:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES12:%.*]] = fadd <2 x double> [[RES2]], [[RES3]]
+; CHECK-NEXT: [[RES13:%.*]] = fadd <2 x double> [[RES11]], [[RES12]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES13]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 12, <2 x double> zeroinitializer, i8 %x3, i32 4)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 13, <2 x double> %x2, i8 %x3, i32 8)
+ %res3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 14, <2 x double> %x2, i8 -1, i32 4)
+ %res11 = fadd <2 x double> %res, %res1
+ %res12 = fadd <2 x double> %res2, %res3
+ %res13 = fadd <2 x double> %res11, %res12
+ ret <2 x double> %res13
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 11, <4 x float> [[X2:%.*]], i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 12, <4 x float> zeroinitializer, i8 [[X3]], i32 4)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 13, <4 x float> [[X2]], i8 -1, i32 8)
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT: [[_MSOR18:%.*]] = or i1 [[_MSCMP16]], [[_MSCMP17]]
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP21]], 0
+; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSOR18]], [[_MSCMP19]]
+; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]]
+; CHECK: 22:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 23:
+; CHECK-NEXT: [[RES3:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 14, <4 x float> [[X2]], i8 -1, i32 4)
+; CHECK-NEXT: [[RES11:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES12:%.*]] = fadd <4 x float> [[RES2]], [[RES3]]
+; CHECK-NEXT: [[RES13:%.*]] = fadd <4 x float> [[RES11]], [[RES12]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES13]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 12, <4 x float> zeroinitializer, i8 %x3, i32 4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 13, <4 x float> %x2, i8 -1, i32 8)
+ %res3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 14, <4 x float> %x2, i8 -1, i32 4)
+ %res11 = fadd <4 x float> %res, %res1
+ %res12 = fadd <4 x float> %res2, %res3
+ %res13 = fadd <4 x float> %res11, %res12
+ ret <4 x float> %res13
+}
+
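+; As above, but the second operand is loaded from memory: its shadow is read
+; from the shadow map (application address XOR 0x500000000000, i.e.
+; 87960930222080) after the pointer operand itself has been checked.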
+define <4 x float> @test_int_x86_avx512_mask_getmant_ss_load(<4 x float> %x0, ptr %x1p) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ss_load(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[X1:%.*]] = load <4 x float>, ptr [[X1P:%.*]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[X1P]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1]], i32 11, <4 x float> undef, i8 -1, i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %x1 = load <4 x float>, ptr %x1p
+ %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
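+; vpermilvar.pd.512 also gets the strict fallback check. In the mask/maskz
+; variants below, the IR-level select on %mask is instrumented precisely
+; instead: the shadows of both select arms are blended, and the mask's own
+; shadow widens the result wherever the selector bit is uninitialized
+; (_MSPROP_SELECT).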
+declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)
+
+define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> [[RES]], <8 x double> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> %x2
+ ret <8 x double> %res2
+}
+
+define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0, <8 x i64> %x1, i8 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_maskz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> [[RES]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> zeroinitializer
+ ret <8 x double> %res2
+}
+
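+; Same pattern for the <16 x float> variant: strict check of both operand
+; shadows for the intrinsic itself, precise select instrumentation for the
+; mask/maskz wrappers.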
+declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>)
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
+ ret <16 x float> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_maskz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
+ ret <16 x float> %res2
+}
+
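+; With a constant index vector the corresponding shadow is zeroinitializer and
+; folds away, so only %x0's shadow feeds the pre-call check; the masked
+; variants still propagate the passthrough and mask shadows through the
+; select.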
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
+ ret <16 x float> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
+ ret <16 x float> %res2
+}
+
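+; cvtss2sd.round is strictly checked: all operand shadows, including the i8
+; mask when it is non-constant, guard the warning call; the constant mask
+; (i8 -1) in the second call contributes no check.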
+declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ss2sd_round(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> [[X0]], <4 x float> [[X1]], <2 x double> [[X2]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES2]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
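+; Mirror test for the opposite direction (cvtsd2ss.round), including a
+; non-default rounding mode (i32 11) on the first call.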
+declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x double>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_sd2ss_round(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> [[X0:%.*]], <2 x double> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> [[X0]], <2 x double> [[X1]], <4 x float> [[X2]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES2]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 11)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
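+; pternlog takes three data vectors plus a constant truth-table immediate;
+; only the three vector shadows are checked, and the result shadow is clean.
+; The mask/maskz wrappers again rely on the select instrumentation.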
+declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
+
+define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pternlog_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP9]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
+ ret <16 x i32> %1
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> [[TMP1]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X0]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X0]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
+ %2 = bitcast i16 %x4 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+ ret <16 x i32> %3
+}
+
+define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
+ %2 = bitcast i16 %x4 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+ ret <16 x i32> %3
+}
+
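+; <8 x i64> counterpart of the pternlog tests above.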
+declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
+
+define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pternlog_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP9]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
+ ret <8 x i64> %1
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> [[TMP1]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[X0]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[X0]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP17]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
+ %2 = bitcast i8 %x4 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ ret <8 x i64> %3
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP17]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
+ %2 = bitcast i8 %x4 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ ret <8 x i64> %3
+}
+
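+; vcomi.sd folds both vector shadows into the pre-call check and returns a
+; fully-initialized i32 (shadow 0), with and without SAE (i32 8 vs. i32 4).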
+define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_comi_sd_eq_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 0, i32 8)
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8)
+ ret i32 %res
+}
+
+define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_ucomi_sd_eq_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 8, i32 8)
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8)
+ ret i32 %res
+}
+
+define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_comi_sd_eq(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 0, i32 4)
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4)
+ ret i32 %res
+}
+
+define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_ucomi_sd_eq(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 8, i32 4)
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4)
+ ret i32 %res
+}
+
+define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_comi_sd_lt_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 1, i32 8)
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8)
+ ret i32 %res
+}
+
+define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_ucomi_sd_lt_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 9, i32 8)
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8)
+ ret i32 %res
+}
+
+define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_comi_sd_lt(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 1, i32 4)
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4)
+ ret i32 %res
+}
+
+define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_ucomi_sd_lt(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 9, i32 4)
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
+
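+; The single-precision vcomi.ss variant follows the same pattern.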
+define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_ucomi_ss_lt(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i32 9, i32 4)
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
+
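+; permvar with a floating-point result (df/sf) is handled strictly: the index
+; operand type (<8 x i64> here) differs from the result type, so the
+; same-type simple-nomem handler does not apply; every operand bit is
+; checked and the return shadow is zeroed.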
+declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
+
+define <8 x double> @test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_permvar_df_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP7]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
+ ret <8 x double> %1
+}
+
+define <8 x double> @test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_df_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP18]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x2
+ ret <8 x double> %3
+}
+
+define <8 x double> @test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_df_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP16]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
+ ret <8 x double> %3
+}
+
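+; Integer permvar (di/si) does match the simple-nomem handler: all operand
+; types equal the return type, so the result shadow is just the bitwise OR of
+; the operand shadows ([[_MSPROP]]) and no warning branch is emitted.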
+declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
+
+define <8 x i64> @test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_permvar_di_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
+ ret <8 x i64> %1
+}
+
+define <8 x i64> @test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_di_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2
+ ret <8 x i64> %3
+}
+
+define <8 x i64> @test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_di_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ ret <8 x i64> %3
+}
+
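+; Single-precision permvar: strict handling, as in the df case above.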
+declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
+
+define <16 x float> @test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_permvar_sf_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_sf_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
+ %2 = bitcast i16 %x3 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %x2
+ ret <16 x float> %3
+}
+
+define <16 x float> @test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_sf_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
+ %2 = bitcast i16 %x3 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+ ret <16 x float> %3
+}
+
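+; 32-bit integer permvar: OR-propagation of shadows, as in the di case above.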
+declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
+
+define <16 x i32> @test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_permvar_si_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
+ ret <16 x i32> %1
+}
+
+define <16 x i32> @test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_si_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
+ %2 = bitcast i16 %x3 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
+ ret <16 x i32> %3
+}
+
+define <16 x i32> @test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_si_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP11]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
+ %2 = bitcast i16 %x3 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+ ret <16 x i32> %3
+}
+
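+; fixupimm takes six operands (two of them immediates) of differing types, so
+; the simple handler cannot match; MSan conservatively checks the shadow of
+; every non-constant operand (including the i8/i16 mask) before each call and
+; stores a clean return shadow. Constant operands (zeroinitializer, -1) have
+; a zero shadow and drop out of the checks.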
+declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
+
+define <8 x double> @test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 4, i8 [[X4:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> [[X1]], <8 x i64> [[X2]], i32 5, i8 [[X4]], i32 4)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> [[X2]], i32 3, i8 -1, i32 8)
+; CHECK-NEXT: [[RES3:%.*]] = fadd <8 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES4]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 8)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
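+; The load variant first checks that the pointer argument itself is
+; initialized, then loads the operand's shadow from shadow memory, whose
+; address is computed by XOR-ing the application address with the shadow
+; base offset (87960930222080 = 0x500000000000).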
+define <8 x double> @test_int_x86_avx512_mask_fixupimm_pd_512_load(<8 x double> %x0, <8 x double> %x1, ptr %x2ptr) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_pd_512_load(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[X2:%.*]] = load <8 x i64>, ptr [[X2PTR:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[_MSLD]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2]], i32 3, i8 -1, i32 4)
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES]]
+;
+ %x2 = load <8 x i64>, ptr %x2ptr
+ %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 4)
+ ret <8 x double> %res
+}
+
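+; The maskz calls below vary which operands are constant, so each call site
+; checks a different subset of shadows (e.g. the i8 -1 mask and the
+; zeroinitializer vector need no check).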
+declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
+
+define <8 x double> @test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 3, i8 [[X4:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 4)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> [[X2]], i32 2, i8 -1, i32 8)
+; CHECK-NEXT: [[RES3:%.*]] = fadd <8 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES4]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
+ %res2 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 2, i8 -1, i32 8)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
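+; Scalar (ss) variants: the same strict scheme with <4 x float>/<4 x i32>
+; operands, i128 shadow compares, and 16-byte parameter TLS slots.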
+declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
+
+define <4 x float> @test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 5, i8 [[X4]], i32 4)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 5, i8 -1, i32 8)
+; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES4]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 8)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res3, %res2
+ ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
+
+define <4 x float> @test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 5, i8 [[X4]], i32 8)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 6, i8 -1, i32 4)
+; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES4]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8)
+ %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 6, i8 -1, i32 4)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res3, %res2
+ ret <4 x float> %res4
+}
+
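+; 512-bit ps variants: as above, with <16 x float>/<16 x i32> operands and an
+; i16 mask.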
+declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
+
+define <16 x float> @test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 5, i16 [[X4:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i16 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> zeroinitializer, i32 5, i16 [[X4]], i32 4)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> [[X2]], i32 5, i16 -1, i32 8)
+; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES4]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 8)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res3, %res2
+ ret <16 x float> %res4
+}
+
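+; Load variant for ps: the pointer shadow is checked first, then the
+; <16 x i32> shadow is read from the mapped shadow address, as in the pd
+; load test above.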
+define <16 x float> @test_int_x86_avx512_mask_fixupimm_ps_512_load(<16 x float> %x0, <16 x float> %x1, ptr %x2ptr) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ps_512_load(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2PTR:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2]], i32 5, i16 -1, i32 4)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %x2 = load <16 x i32>, ptr %x2ptr
+ %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
+
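+; maskz variant: each call is strictly checked, and operands that are
+; constants (the zeroinitializer table, the -1 mask) contribute no shadow
+; check of their own.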
+define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 5, i16 [[X4:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i16 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> zeroinitializer, i32 6, i16 [[X4]], i32 8)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> [[X2]], i32 7, i16 -1, i32 4)
+; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES4]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 6, i16 %x4, i32 8)
+ %res2 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 7, i16 -1, i32 4)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res3, %res2
+ ret <16 x float> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
+
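+; Scalar (sd) variant: <2 x double> operands with an i8 mask, so shadows are
+; compared as i128 rather than i512.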
+define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 8)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 6, i8 -1, i32 4)
+; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES4]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 6, i8 -1, i32 4)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
+
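+; maskz scalar variant; the third call keeps the i8 %x4 mask, so its shadow
+; check is included alongside the vector operand checks.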
+define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 8)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 5, i8 [[X4]], i32 8)
+; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES4]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
+ %res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 8)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res4
+}
+
+declare double @llvm.fma.f64(double, double, double) #1
+declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #0
+
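+; Merge-masked scalar FMA. The plain llvm.fma.f64 call propagates shadow
+; (OR of the lane-0 operand shadows), while the rounding-mode intrinsic
+; llvm.x86.avx512.vfmadd.f64 is handled strictly: every operand is checked,
+; an uninitialized input branches to __msan_warning_noreturn, and the result
+; shadow is defined clean.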
+define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP5]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP5]]
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X0]], double [[TMP18]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
+; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
+; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
+; CHECK: 23:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 24:
+; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11)
+; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
+; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10)
+; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP11]]
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP27]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0
+; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]]
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP27]]
+; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT15]], i64 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X0]], double [[TMP42]], i64 0
+; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]]
+; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES4]]
+;
+ %1 = extractelement <2 x double> %x0, i64 0
+ %2 = extractelement <2 x double> %x1, i64 0
+ %3 = extractelement <2 x double> %x2, i64 0
+ %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, double %4, double %1
+ %8 = insertelement <2 x double> %x0, double %7, i64 0
+ %9 = extractelement <2 x double> %x0, i64 0
+ %10 = extractelement <2 x double> %x1, i64 0
+ %11 = extractelement <2 x double> %x2, i64 0
+ %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11)
+ %13 = insertelement <2 x double> %x0, double %12, i64 0
+ %14 = extractelement <2 x double> %x0, i64 0
+ %15 = extractelement <2 x double> %x1, i64 0
+ %16 = extractelement <2 x double> %x2, i64 0
+ %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 10)
+ %18 = bitcast i8 %x3 to <8 x i1>
+ %19 = extractelement <8 x i1> %18, i64 0
+ %20 = select i1 %19, double %17, double %14
+ %21 = insertelement <2 x double> %x0, double %20, i64 0
+ %res3 = fadd <2 x double> %8, %13
+ %res4 = fadd <2 x double> %21, %res3
+ ret <2 x double> %res4
+}
+
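+; Single-precision analogue of the test above, using llvm.fma.f32 and
+; llvm.x86.avx512.vfmadd.f32 with i32 lane shadows.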
+define <4 x float> @test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP5]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP5]]
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
+; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
+; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
+; CHECK: 23:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 24:
+; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11)
+; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
+; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10)
+; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP11]]
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP27]] to i32
+; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]]
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP27]]
+; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT15]], i64 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X0]], float [[TMP42]], i64 0
+; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]]
+; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES4]]
+;
+ %1 = extractelement <4 x float> %x0, i64 0
+ %2 = extractelement <4 x float> %x1, i64 0
+ %3 = extractelement <4 x float> %x2, i64 0
+ %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, float %4, float %1
+ %8 = insertelement <4 x float> %x0, float %7, i64 0
+ %9 = extractelement <4 x float> %x0, i64 0
+ %10 = extractelement <4 x float> %x1, i64 0
+ %11 = extractelement <4 x float> %x2, i64 0
+ %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11)
+ %13 = insertelement <4 x float> %x0, float %12, i64 0
+ %14 = extractelement <4 x float> %x0, i64 0
+ %15 = extractelement <4 x float> %x1, i64 0
+ %16 = extractelement <4 x float> %x2, i64 0
+ %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 10)
+ %18 = bitcast i8 %x3 to <8 x i1>
+ %19 = extractelement <8 x i1> %18, i64 0
+ %20 = select i1 %19, float %17, float %14
+ %21 = insertelement <4 x float> %x0, float %20, i64 0
+ %res3 = fadd <4 x float> %8, %13
+ %res4 = fadd <4 x float> %21, %res3
+ ret <4 x float> %res4
+}
+
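+; This function is not marked #0, so MSan treats every argument as fully
+; initialized: all shadows fold to constant zero and no warning branches are
+; emitted (note the `select i1 false` in the checks below).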
+define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_sd(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 0, i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast double [[TMP4]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP9]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP10]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i64 [[TMP11]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], double [[TMP4]], double 0.000000e+00
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> [[X0]], double [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]], i32 11)
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 0, i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast double [[TMP17]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP22]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select i1 false, i64 [[TMP24]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], double [[TMP17]], double 0.000000e+00
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0
+; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[TMP13]], [[TMP26]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES2]]
+;
+ %1 = extractelement <2 x double> %x0, i64 0
+ %2 = extractelement <2 x double> %x1, i64 0
+ %3 = extractelement <2 x double> %x2, i64 0
+ %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, double %4, double 0.000000e+00
+ %8 = insertelement <2 x double> %x0, double %7, i64 0
+ %9 = extractelement <2 x double> %x0, i64 0
+ %10 = extractelement <2 x double> %x1, i64 0
+ %11 = extractelement <2 x double> %x2, i64 0
+ %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11)
+ %13 = bitcast i8 %x3 to <8 x i1>
+ %14 = extractelement <8 x i1> %13, i64 0
+ %15 = select i1 %14, double %12, double 0.000000e+00
+ %16 = insertelement <2 x double> %x0, double %15, i64 0
+ %res2 = fadd <2 x double> %8, %16
+ ret <2 x double> %res2
+}
+
+declare float @llvm.fma.f32(float, float, float) #1
+declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #0
+
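+; Single-precision analogue; also not marked #0, so all shadows are constant
+; zero and no checks are emitted.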
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float [[TMP4]] to i32
+; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP8]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[TMP9]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = or i32 [[TMP10]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP11]], i32 [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], float [[TMP4]], float 0.000000e+00
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[X0]], float [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]], i32 11)
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 0
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP17]] to i32
+; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP21]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select i1 false, i32 [[TMP24]], i32 [[TMP20]]
+; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], float [[TMP17]], float 0.000000e+00
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0
+; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[TMP13]], [[TMP26]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES2]]
+;
+ %1 = extractelement <4 x float> %x0, i64 0
+ %2 = extractelement <4 x float> %x1, i64 0
+ %3 = extractelement <4 x float> %x2, i64 0
+ %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, float %4, float 0.000000e+00
+ %8 = insertelement <4 x float> %x0, float %7, i64 0
+ %9 = extractelement <4 x float> %x0, i64 0
+ %10 = extractelement <4 x float> %x1, i64 0
+ %11 = extractelement <4 x float> %x2, i64 0
+ %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11)
+ %13 = bitcast i8 %x3 to <8 x i1>
+ %14 = extractelement <8 x i1> %13, i64 0
+ %15 = select i1 %14, float %12, float 0.000000e+00
+ %16 = insertelement <4 x float> %x0, float %15, i64 0
+ %res2 = fadd <4 x float> %8, %16
+ ret <4 x float> %res2
+}
+
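+; maskz scalar FMA with the first operand loaded through a pointer: the
+; pointer's shadow is checked before the load, and the loaded shadow
+; ([[_MSLD]]) feeds the lane-0 shadow propagation.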
+define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_load0(i8 zeroext %0, ptr nocapture readonly %1, float %2, float %3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_load0(
+; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[TMP1:%.*]], align 16
+; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP14]], align 16
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[_MSLD]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], [[TMP6]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or i32 [[_MSPROP1]], [[TMP7]]
+; CHECK-NEXT: [[TMP16:%.*]] = tail call float @llvm.fma.f32(float [[TMP15]], float [[TMP2:%.*]], float [[TMP3:%.*]])
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP8]] to <8 x i1>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP0:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <8 x i1> [[TMP17]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[_MSPROP2]], i32 0
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP16]] to i32
+; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP21]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP3]], i32 [[TMP24]], i32 [[TMP20]]
+; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], float [[TMP16]], float 0.000000e+00
+; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <4 x i32> [[_MSLD]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP25]], i64 0
+; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP26]]
+;
+ %5 = load <4 x float>, ptr %1, align 16
+ %6 = extractelement <4 x float> %5, i64 0
+ %7 = tail call float @llvm.fma.f32(float %6, float %2, float %3) #2
+ %8 = bitcast i8 %0 to <8 x i1>
+ %9 = extractelement <8 x i1> %8, i64 0
+ %10 = select i1 %9, float %7, float 0.000000e+00
+ %11 = insertelement <4 x float> %5, float %10, i64 0
+ ret <4 x float> %11
+}
+
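+; mask3 variant: the masked select merges into the addend %x2 rather than
+; %x0, so the fallback shadow is lane 0 of %x2's shadow ([[_MSPROP2]]).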
+define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP7]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP2]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP7]]
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X2]], double [[TMP18]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
+; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
+; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
+; CHECK: 23:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 24:
+; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11)
+; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X2]], double [[TMP25]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
+; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10)
+; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP13]]
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP29]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0
+; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP13]]
+; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]]
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP29]]
+; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT15]], i64 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X2]], double [[TMP42]], i64 0
+; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]]
+; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES4]]
+;
+ %1 = extractelement <2 x double> %x0, i64 0
+ %2 = extractelement <2 x double> %x1, i64 0
+ %3 = extractelement <2 x double> %x2, i64 0
+ %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, double %4, double %3
+ %8 = insertelement <2 x double> %x2, double %7, i64 0
+ %9 = extractelement <2 x double> %x0, i64 0
+ %10 = extractelement <2 x double> %x1, i64 0
+ %11 = extractelement <2 x double> %x2, i64 0
+ %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11)
+ %13 = insertelement <2 x double> %x2, double %12, i64 0
+ %14 = extractelement <2 x double> %x0, i64 0
+ %15 = extractelement <2 x double> %x1, i64 0
+ %16 = extractelement <2 x double> %x2, i64 0
+ %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 10)
+ %18 = bitcast i8 %x3 to <8 x i1>
+ %19 = extractelement <8 x i1> %18, i64 0
+ %20 = select i1 %19, double %17, double %16
+ %21 = insertelement <2 x double> %x2, double %20, i64 0
+ %res3 = fadd <2 x double> %8, %13
+ %res4 = fadd <2 x double> %21, %res3
+ ret <2 x double> %res4
+}
+
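+; Single-precision mask3 analogue.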
+define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP7]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP2]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP7]]
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X2]], float [[TMP18]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
+; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
+; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
+; CHECK: 23:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 24:
+; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11)
+; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP3]], i32 0, i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X2]], float [[TMP25]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
+; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10)
+; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP13]]
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP29]] to i32
+; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP13]]
+; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]]
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP29]]
+; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT15]], i64 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X2]], float [[TMP42]], i64 0
+; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]]
+; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES4]]
+;
+ %1 = extractelement <4 x float> %x0, i64 0
+ %2 = extractelement <4 x float> %x1, i64 0
+ %3 = extractelement <4 x float> %x2, i64 0
+ %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, float %4, float %3
+ %8 = insertelement <4 x float> %x2, float %7, i64 0
+ %9 = extractelement <4 x float> %x0, i64 0
+ %10 = extractelement <4 x float> %x1, i64 0
+ %11 = extractelement <4 x float> %x2, i64 0
+ %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11)
+ %13 = insertelement <4 x float> %x2, float %12, i64 0
+ %14 = extractelement <4 x float> %x0, i64 0
+ %15 = extractelement <4 x float> %x1, i64 0
+ %16 = extractelement <4 x float> %x2, i64 0
+ %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 10)
+ %18 = bitcast i8 %x3 to <8 x i1>
+ %19 = extractelement <8 x i1> %18, i64 0
+ %20 = select i1 %19, float %17, float %16
+ %21 = insertelement <4 x float> %x2, float %20, i64 0
+ %res3 = fadd <4 x float> %8, %13
+ %res4 = fadd <4 x float> %21, %res3
+ ret <4 x float> %res4
+}
+
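+; Memory-folded variant: scalars are loaded from %a and %b, the masked FMA
+; result is written back to %a, and the matching result shadow is stored to
+; %a's shadow location just before the float store.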
+define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) #0 {
+; CHECK-LABEL: @fmadd_ss_mask_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0
+; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1
+; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2
+; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3
+; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]]
+; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]]
+; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 [[_MSPROP9]]
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast float [[TMP14]] to i32
+; CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], [[_MSPROP13]]
+; CHECK-NEXT: [[TMP26:%.*]] = or i32 [[TMP25]], [[_MSPROP9]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP26]], i32 [[TMP21]]
+; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], float [[TMP17]], float [[TMP14]]
+; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[AV]], float [[TMP27]], i64 0
+; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0
+; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP28]], i32 0
+; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]]
+; CHECK: 29:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 30:
+; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080
+; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP33]], align 4
+; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4
+; CHECK-NEXT: ret void
+;
+ %a.val = load float, ptr %a
+ %av0 = insertelement <4 x float> undef, float %a.val, i32 0
+ %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
+ %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
+ %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
+
+ %b.val = load float, ptr %b
+ %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+ %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+ %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+ %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+ %1 = extractelement <4 x float> %av, i64 0
+ %2 = extractelement <4 x float> %bv, i64 0
+ %3 = extractelement <4 x float> %av, i64 0
+ %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
+ %5 = bitcast i8 %c to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, float %4, float %1
+ %8 = insertelement <4 x float> %av, float %7, i64 0
+ %sr = extractelement <4 x float> %8, i32 0
+ store float %sr, ptr %a
+ ret void
+}
+
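+; Zeroing-mask variant of the test above: the false arm of the select is
+; constant 0.0, so a zero shadow feeds the instrumented select.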
+define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 {
+; CHECK-LABEL: @fmadd_ss_maskz_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0
+; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1
+; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2
+; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3
+; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]]
+; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]]
+; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 0
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = xor i32 [[TMP22]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP13]]
+; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP25]], i32 [[TMP21]]
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], float [[TMP17]], float 0.000000e+00
+; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[AV]], float [[TMP26]], i64 0
+; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0
+; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP27]], i32 0
+; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
+; CHECK: 28:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 29:
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080
+; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP32]], align 4
+; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4
+; CHECK-NEXT: ret void
+;
+ %a.val = load float, ptr %a
+ %av0 = insertelement <4 x float> undef, float %a.val, i32 0
+ %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
+ %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
+ %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
+
+ %b.val = load float, ptr %b
+ %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+ %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+ %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+ %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+ %1 = extractelement <4 x float> %av, i64 0
+ %2 = extractelement <4 x float> %bv, i64 0
+ %3 = extractelement <4 x float> %av, i64 0
+ %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
+ %5 = bitcast i8 %c to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, float %4, float 0.000000e+00
+ %8 = insertelement <4 x float> %av, float %7, i64 0
+ %sr = extractelement <4 x float> %8, i32 0
+ store float %sr, ptr %a
+ ret void
+}
+
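+; Double-precision counterpart of fmadd_ss_mask_memfold; the same shadow
+; propagation applies, but with i64 shadows for <2 x double>.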
+define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) #0 {
+; CHECK-LABEL: @fmadd_sd_mask_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0
+; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1
+; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]]
+; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]]
+; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 [[_MSPROP5]]
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast double [[TMP14]] to i64
+; CHECK-NEXT: [[TMP24:%.*]] = xor i64 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], [[_MSPROP9]]
+; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP25]], [[_MSPROP5]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP26]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], double [[TMP17]], double [[TMP14]]
+; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> [[AV]], double [[TMP27]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0
+; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP28]], i32 0
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]]
+; CHECK: 29:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 30:
+; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080
+; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP33]], align 8
+; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8
+; CHECK-NEXT: ret void
+;
+ %a.val = load double, ptr %a
+ %av0 = insertelement <2 x double> undef, double %a.val, i32 0
+ %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
+
+ %b.val = load double, ptr %b
+ %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
+ %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
+ %1 = extractelement <2 x double> %av, i64 0
+ %2 = extractelement <2 x double> %bv, i64 0
+ %3 = extractelement <2 x double> %av, i64 0
+ %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
+ %5 = bitcast i8 %c to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, double %4, double %1
+ %8 = insertelement <2 x double> %av, double %7, i64 0
+ %sr = extractelement <2 x double> %8, i32 0
+ store double %sr, ptr %a
+ ret void
+}
+
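+; Double-precision, zeroing-mask counterpart of fmadd_ss_maskz_memfold.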
+define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 {
+; CHECK-LABEL: @fmadd_sd_maskz_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0
+; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1
+; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]]
+; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]]
+; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = xor i64 [[TMP22]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], [[_MSPROP9]]
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP25]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], double [[TMP17]], double 0.000000e+00
+; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[AV]], double [[TMP26]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0
+; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP27]], i32 0
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
+; CHECK: 28:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 29:
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080
+; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP32]], align 8
+; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8
+; CHECK-NEXT: ret void
+;
+ %a.val = load double, ptr %a
+ %av0 = insertelement <2 x double> undef, double %a.val, i32 0
+ %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
+
+ %b.val = load double, ptr %b
+ %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
+ %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
+ %1 = extractelement <2 x double> %av, i64 0
+ %2 = extractelement <2 x double> %bv, i64 0
+ %3 = extractelement <2 x double> %av, i64 0
+ %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
+ %5 = bitcast i8 %c to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, double %4, double 0.000000e+00
+ %8 = insertelement <2 x double> %av, double %7, i64 0
+ %sr = extractelement <2 x double> %8, i32 0
+ store double %sr, ptr %a
+ ret void
+}
+
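+; vfmsub expressed as fneg + fma. The variants with a non-default rounding
+; mode call @llvm.x86.avx512.vfmadd.f64, which MSan handles by checking the
+; shadows of the scalar operands before the call.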
+define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X2:%.*]]
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP6]], double [[TMP7]], double [[TMP8]])
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[_MSPROP4]], i64 [[_MSPROP5]]
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast double [[TMP9]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP17]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP5]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP19]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], double [[TMP9]], double [[TMP10]]
+; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> [[X2]], double [[TMP20]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = fneg <2 x double> [[X2]]
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP22]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]]
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]]
+; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]]
+; CHECK: 26:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 27:
+; CHECK-NEXT: [[TMP28:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP23]], double [[TMP24]], double [[TMP25]], i32 11)
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x double> [[X2]], double [[TMP28]], i64 0
+; CHECK-NEXT: [[TMP31:%.*]] = fneg <2 x double> [[X2]]
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP31]], i64 0
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0
+; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]]
+; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0
+; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]]
+; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]]
+; CHECK: 35:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 36:
+; CHECK-NEXT: [[TMP37:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP32]], double [[TMP33]], double [[TMP34]], i32 10)
+; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i64 0, i64 [[_MSPROP16]]
+; CHECK-NEXT: [[TMP43:%.*]] = bitcast double [[TMP37]] to i64
+; CHECK-NEXT: [[TMP44:%.*]] = bitcast double [[TMP38]] to i64
+; CHECK-NEXT: [[TMP45:%.*]] = xor i64 [[TMP43]], [[TMP44]]
+; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[TMP45]], 0
+; CHECK-NEXT: [[TMP47:%.*]] = or i64 [[TMP46]], [[_MSPROP16]]
+; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP47]], i64 [[TMP42]]
+; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], double [[TMP37]], double [[TMP38]]
+; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT18]], i64 0
+; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x double> [[X2]], double [[TMP48]], i64 0
+; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP21]], [[TMP30]]
+; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP49]], [[RES3]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES4]]
+;
+ %1 = fneg <2 x double> %x2
+ %2 = extractelement <2 x double> %x0, i64 0
+ %3 = extractelement <2 x double> %x1, i64 0
+ %4 = extractelement <2 x double> %1, i64 0
+ %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
+ %6 = extractelement <2 x double> %x2, i64 0
+ %7 = bitcast i8 %x3 to <8 x i1>
+ %8 = extractelement <8 x i1> %7, i64 0
+ %9 = select i1 %8, double %5, double %6
+ %10 = insertelement <2 x double> %x2, double %9, i64 0
+ %11 = fneg <2 x double> %x2
+ %12 = extractelement <2 x double> %x0, i64 0
+ %13 = extractelement <2 x double> %x1, i64 0
+ %14 = extractelement <2 x double> %11, i64 0
+ %15 = call double @llvm.x86.avx512.vfmadd.f64(double %12, double %13, double %14, i32 11)
+ %16 = extractelement <2 x double> %x2, i64 0
+ %17 = insertelement <2 x double> %x2, double %15, i64 0
+ %18 = fneg <2 x double> %x2
+ %19 = extractelement <2 x double> %x0, i64 0
+ %20 = extractelement <2 x double> %x1, i64 0
+ %21 = extractelement <2 x double> %18, i64 0
+ %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 10)
+ %23 = extractelement <2 x double> %x2, i64 0
+ %24 = bitcast i8 %x3 to <8 x i1>
+ %25 = extractelement <8 x i1> %24, i64 0
+ %26 = select i1 %25, double %22, double %23
+ %27 = insertelement <2 x double> %x2, double %26, i64 0
+ %res3 = fadd <2 x double> %10, %17
+ %res4 = fadd <2 x double> %27, %res3
+ ret <2 x double> %res4
+}
+
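+; Single-precision counterpart of test_int_x86_avx512_mask3_vfmsub_sd.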
+define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X2:%.*]]
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float [[TMP7]], float [[TMP8]])
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[_MSPROP4]], i32 [[_MSPROP5]]
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast float [[TMP9]] to i32
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32
+; CHECK-NEXT: [[TMP17:%.*]] = xor i32 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP17]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP5]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP19]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], float [[TMP9]], float [[TMP10]]
+; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[X2]], float [[TMP20]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = fneg <4 x float> [[X2]]
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP22]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]]
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]]
+; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]]
+; CHECK: 26:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 27:
+; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 11)
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[X2]], float [[TMP28]], i64 0
+; CHECK-NEXT: [[TMP31:%.*]] = fneg <4 x float> [[X2]]
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP31]], i64 0
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0
+; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]]
+; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0
+; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]]
+; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]]
+; CHECK: 35:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 36:
+; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP32]], float [[TMP33]], float [[TMP34]], i32 10)
+; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 0, i32 [[_MSPROP16]]
+; CHECK-NEXT: [[TMP43:%.*]] = bitcast float [[TMP37]] to i32
+; CHECK-NEXT: [[TMP44:%.*]] = bitcast float [[TMP38]] to i32
+; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP44]]
+; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP45]], 0
+; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP46]], [[_MSPROP16]]
+; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP47]], i32 [[TMP42]]
+; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], float [[TMP37]], float [[TMP38]]
+; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT18]], i64 0
+; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[X2]], float [[TMP48]], i64 0
+; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP21]], [[TMP30]]
+; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP49]], [[RES3]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES4]]
+;
+ %1 = fneg <4 x float> %x2
+ %2 = extractelement <4 x float> %x0, i64 0
+ %3 = extractelement <4 x float> %x1, i64 0
+ %4 = extractelement <4 x float> %1, i64 0
+ %5 = call float @llvm.fma.f32(float %2, float %3, float %4)
+ %6 = extractelement <4 x float> %x2, i64 0
+ %7 = bitcast i8 %x3 to <8 x i1>
+ %8 = extractelement <8 x i1> %7, i64 0
+ %9 = select i1 %8, float %5, float %6
+ %10 = insertelement <4 x float> %x2, float %9, i64 0
+ %11 = fneg <4 x float> %x2
+ %12 = extractelement <4 x float> %x0, i64 0
+ %13 = extractelement <4 x float> %x1, i64 0
+ %14 = extractelement <4 x float> %11, i64 0
+ %15 = call float @llvm.x86.avx512.vfmadd.f32(float %12, float %13, float %14, i32 11)
+ %16 = extractelement <4 x float> %x2, i64 0
+ %17 = insertelement <4 x float> %x2, float %15, i64 0
+ %18 = fneg <4 x float> %x2
+ %19 = extractelement <4 x float> %x0, i64 0
+ %20 = extractelement <4 x float> %x1, i64 0
+ %21 = extractelement <4 x float> %18, i64 0
+ %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 10)
+ %23 = extractelement <4 x float> %x2, i64 0
+ %24 = bitcast i8 %x3 to <8 x i1>
+ %25 = extractelement <8 x i1> %24, i64 0
+ %26 = select i1 %25, float %22, float %23
+ %27 = insertelement <4 x float> %x2, float %26, i64 0
+ %res3 = fadd <4 x float> %10, %17
+ %res4 = fadd <4 x float> %27, %res3
+ ret <4 x float> %res4
+}
+
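+; vfnmsub expressed by negating both %x0 and %x2 before the fma; the
+; rounding-mode calls to @llvm.x86.avx512.vfmadd.f64 again get operand
+; shadow checks rather than exact shadow propagation.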
+define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X0:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = fneg <2 x double> [[X2:%.*]]
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]])
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[_MSPROP4]], i64 [[_MSPROP5]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast double [[TMP11]] to i64
+; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP19]], [[_MSPROP5]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP20]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], double [[TMP10]], double [[TMP11]]
+; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[X2]], double [[TMP21]], i64 0
+; CHECK-NEXT: [[TMP23:%.*]] = fneg <2 x double> [[X0]]
+; CHECK-NEXT: [[TMP24:%.*]] = fneg <2 x double> [[X2]]
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP23]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[TMP24]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]]
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]]
+; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
+; CHECK: 28:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 29:
+; CHECK-NEXT: [[TMP30:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]], i32 11)
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP2]], i64 0, i64 0
+; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[X2]], double [[TMP30]], i64 0
+; CHECK-NEXT: [[TMP33:%.*]] = fneg <2 x double> [[X0]]
+; CHECK-NEXT: [[TMP34:%.*]] = fneg <2 x double> [[X2]]
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i64 0
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x double> [[TMP34]], i64 0
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0
+; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]]
+; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0
+; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]]
+; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]]
+; CHECK: 38:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 39:
+; CHECK-NEXT: [[TMP40:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP35]], double [[TMP36]], double [[TMP37]], i32 10)
+; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0
+; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i64 0, i64 [[_MSPROP16]]
+; CHECK-NEXT: [[TMP46:%.*]] = bitcast double [[TMP40]] to i64
+; CHECK-NEXT: [[TMP47:%.*]] = bitcast double [[TMP41]] to i64
+; CHECK-NEXT: [[TMP48:%.*]] = xor i64 [[TMP46]], [[TMP47]]
+; CHECK-NEXT: [[TMP49:%.*]] = or i64 [[TMP48]], 0
+; CHECK-NEXT: [[TMP50:%.*]] = or i64 [[TMP49]], [[_MSPROP16]]
+; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP50]], i64 [[TMP45]]
+; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], double [[TMP40]], double [[TMP41]]
+; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT18]], i64 0
+; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x double> [[X2]], double [[TMP51]], i64 0
+; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP22]], [[TMP32]]
+; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP52]], [[RES3]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES4]]
+;
+ %1 = fneg <2 x double> %x0
+ %2 = fneg <2 x double> %x2
+ %3 = extractelement <2 x double> %1, i64 0
+ %4 = extractelement <2 x double> %x1, i64 0
+ %5 = extractelement <2 x double> %2, i64 0
+ %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
+ %7 = extractelement <2 x double> %x2, i64 0
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %9 = extractelement <8 x i1> %8, i64 0
+ %10 = select i1 %9, double %6, double %7
+ %11 = insertelement <2 x double> %x2, double %10, i64 0
+ %12 = fneg <2 x double> %x0
+ %13 = fneg <2 x double> %x2
+ %14 = extractelement <2 x double> %12, i64 0
+ %15 = extractelement <2 x double> %x1, i64 0
+ %16 = extractelement <2 x double> %13, i64 0
+ %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 11)
+ %18 = extractelement <2 x double> %x2, i64 0
+ %19 = insertelement <2 x double> %x2, double %17, i64 0
+ %20 = fneg <2 x double> %x0
+ %21 = fneg <2 x double> %x2
+ %22 = extractelement <2 x double> %20, i64 0
+ %23 = extractelement <2 x double> %x1, i64 0
+ %24 = extractelement <2 x double> %21, i64 0
+ %25 = call double @llvm.x86.avx512.vfmadd.f64(double %22, double %23, double %24, i32 10)
+ %26 = extractelement <2 x double> %x2, i64 0
+ %27 = bitcast i8 %x3 to <8 x i1>
+ %28 = extractelement <8 x i1> %27, i64 0
+ %29 = select i1 %28, double %25, double %26
+ %30 = insertelement <2 x double> %x2, double %29, i64 0
+ %res3 = fadd <2 x double> %11, %19
+ %res4 = fadd <2 x double> %30, %res3
+ ret <2 x double> %res4
+}
+
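+; Single-precision counterpart of test_int_x86_avx512_mask3_vfnmsub_sd.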
+define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X0:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = fneg <4 x float> [[X2:%.*]]
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]])
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[_MSPROP4]], i32 [[_MSPROP5]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast float [[TMP11]] to i32
+; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP19]], [[_MSPROP5]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP20]], i32 [[TMP15]]
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], float [[TMP10]], float [[TMP11]]
+; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[X2]], float [[TMP21]], i64 0
+; CHECK-NEXT: [[TMP23:%.*]] = fneg <4 x float> [[X0]]
+; CHECK-NEXT: [[TMP24:%.*]] = fneg <4 x float> [[X2]]
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP23]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP24]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]]
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]]
+; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
+; CHECK: 28:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 29:
+; CHECK-NEXT: [[TMP30:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]], i32 11)
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i64 0
+; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[X2]], float [[TMP30]], i64 0
+; CHECK-NEXT: [[TMP33:%.*]] = fneg <4 x float> [[X0]]
+; CHECK-NEXT: [[TMP34:%.*]] = fneg <4 x float> [[X2]]
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i64 0
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP34]], i64 0
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0
+; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]]
+; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0
+; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]]
+; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]]
+; CHECK: 38:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 39:
+; CHECK-NEXT: [[TMP40:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP35]], float [[TMP36]], float [[TMP37]], i32 10)
+; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0
+; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 0, i32 [[_MSPROP16]]
+; CHECK-NEXT: [[TMP46:%.*]] = bitcast float [[TMP40]] to i32
+; CHECK-NEXT: [[TMP47:%.*]] = bitcast float [[TMP41]] to i32
+; CHECK-NEXT: [[TMP48:%.*]] = xor i32 [[TMP46]], [[TMP47]]
+; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP48]], 0
+; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[_MSPROP16]]
+; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP50]], i32 [[TMP45]]
+; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], float [[TMP40]], float [[TMP41]]
+; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT18]], i64 0
+; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[X2]], float [[TMP51]], i64 0
+; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP22]], [[TMP32]]
+; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP52]], [[RES3]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES4]]
+;
+ %1 = fneg <4 x float> %x0
+ %2 = fneg <4 x float> %x2
+ %3 = extractelement <4 x float> %1, i64 0
+ %4 = extractelement <4 x float> %x1, i64 0
+ %5 = extractelement <4 x float> %2, i64 0
+ %6 = call float @llvm.fma.f32(float %3, float %4, float %5)
+ %7 = extractelement <4 x float> %x2, i64 0
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %9 = extractelement <8 x i1> %8, i64 0
+ %10 = select i1 %9, float %6, float %7
+ %11 = insertelement <4 x float> %x2, float %10, i64 0
+ %12 = fneg <4 x float> %x0
+ %13 = fneg <4 x float> %x2
+ %14 = extractelement <4 x float> %12, i64 0
+ %15 = extractelement <4 x float> %x1, i64 0
+ %16 = extractelement <4 x float> %13, i64 0
+ %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 11)
+ %18 = extractelement <4 x float> %x2, i64 0
+ %19 = insertelement <4 x float> %x2, float %17, i64 0
+ %20 = fneg <4 x float> %x0
+ %21 = fneg <4 x float> %x2
+ %22 = extractelement <4 x float> %20, i64 0
+ %23 = extractelement <4 x float> %x1, i64 0
+ %24 = extractelement <4 x float> %21, i64 0
+ %25 = call float @llvm.x86.avx512.vfmadd.f32(float %22, float %23, float %24, i32 10)
+ %26 = extractelement <4 x float> %x2, i64 0
+ %27 = bitcast i8 %x3 to <8 x i1>
+ %28 = extractelement <8 x i1> %27, i64 0
+ %29 = select i1 %28, float %25, float %26
+ %30 = insertelement <4 x float> %x2, float %29, i64 0
+ %res3 = fadd <4 x float> %11, %19
+ %res4 = fadd <4 x float> %30, %res3
+ ret <4 x float> %res4
+}
+
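+; mask3 variant with the second multiplicand loaded from memory: the shadow
+; of the load is inserted into lane 0 and folded into the fma result shadow.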
+define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr %ptr_b, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss_rm(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]]
+; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP3]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP12]] to i32
+; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]]
+; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]]
+; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP12]]
+; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X1]], float [[TMP23]], i64 0
+; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP24]]
+;
+ %q = load float, ptr %ptr_b
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %1 = extractelement <4 x float> %x0, i64 0
+ %2 = extractelement <4 x float> %vecinit.i, i64 0
+ %3 = extractelement <4 x float> %x1, i64 0
+ %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, float %4, float %3
+ %8 = insertelement <4 x float> %x1, float %7, i64 0
+ ret <4 x float> %8
+}
+
+define <4 x float> @test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr %ptr_b, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss_rm(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]]
+; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP1]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP10]] to i32
+; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]]
+; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]]
+; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP10]]
+; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X0]], float [[TMP23]], i64 0
+; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP24]]
+;
+ %q = load float, ptr %ptr_b
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %1 = extractelement <4 x float> %x0, i64 0
+ %2 = extractelement <4 x float> %vecinit.i, i64 0
+ %3 = extractelement <4 x float> %x1, i64 0
+ %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = extractelement <8 x i1> %5, i64 0
+ %7 = select i1 %6, float %4, float %1
+ %8 = insertelement <4 x float> %x0, float %7, i64 0
+ ret <4 x float> %8
+}
+
+
+define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr %ptr_b, i8 %x3, i32 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_rm(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]]
+; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.fma.f32(float [[TMP9]], float [[TMP10]], float [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = select i1 false, i32 [[_MSPROP5]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP12]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP14]], 0
+; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP5]]
+; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP17]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 false, float [[TMP12]], float 0.000000e+00
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0
+; CHECK-NEXT: store <4 x i32> [[_MSPROP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP19]]
+;
+ %q = load float, ptr %ptr_b
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %1 = extractelement <4 x float> %x0, i64 0
+ %2 = extractelement <4 x float> %x1, i64 0
+ %3 = extractelement <4 x float> %vecinit.i, i64 0
+ %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
+ %5 = select i1 false, float %4, float 0.000000e+00
+ %6 = insertelement <4 x float> %x0, float %5, i64 0
+ ret <4 x float> %6
+}
+
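+; For the variable-count shift tests below, MSan applies the shift intrinsic
+; to the first operand's shadow, then ORs in an all-ones vector whenever any
+; bit of the count's shadow is set (bitcast -> trunc -> icmp ne 0 -> sext ->
+; bitcast). In the mask/maskz variants, [[_MSPROP_SELECT]] merges the shadows
+; of both select arms and folds in the mask's own shadow. (This summary is
+; read off the CHECK lines; the corresponding handler in MemorySanitizer.cpp
+; is handleVectorShiftIntrinsic.)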
+define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_psll_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]])
+; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psll_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
+ ret <16 x i32> %res2
+}
+define <16 x i32> @test_x86_avx512_maskz_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psll_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
+ ret <16 x i32> %res2
+}
+declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_psll_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
+; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psll_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
+ ret <8 x i64> %res2
+}
+define <8 x i64> @test_x86_avx512_maskz_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psll_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
+ ret <8 x i64> %res2
+}
+declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone
+
+
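+; The immediate-count shift tests below follow the same scheme as above, but
+; the count is a constant with zero shadow, so the extra term degenerates to
+; an "or ... zeroinitializer" in the CHECK lines.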
+define <16 x i32> @test_x86_avx512_pslli_d_512(<16 x i32> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_pslli_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7)
+; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_pslli_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7)
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
+ ret <16 x i32> %res2
+}
+define <16 x i32> @test_x86_avx512_maskz_pslli_d_512(<16 x i32> %a0, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_pslli_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7)
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
+ ret <16 x i32> %res2
+}
+declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone
+
+
+define <8 x i64> @test_x86_avx512_pslli_q_512(<8 x i64> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_pslli_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7)
+; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_pslli_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7)
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
+ ret <8 x i64> %res2
+}
+define <8 x i64> @test_x86_avx512_maskz_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_pslli_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7)
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
+ ret <8 x i64> %res2
+}
+declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone
+
+
+define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_psra_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
+; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psra_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
+ ret <8 x i64> %res2
+}
+define <8 x i64> @test_x86_avx512_maskz_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psra_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
+ ret <8 x i64> %res2
+}
+declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone
+
+
+define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_psra_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]])
+; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psra_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
+ ret <16 x i32> %res2
+}
+define <16 x i32> @test_x86_avx512_maskz_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psra_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
+ ret <16 x i32> %res2
+}
+declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone
+
+
+
+define <8 x i64> @test_x86_avx512_psrai_q_512(<8 x i64> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_psrai_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7)
+; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psrai_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7)
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
+ ret <8 x i64> %res2
+}
+define <8 x i64> @test_x86_avx512_maskz_psrai_q_512(<8 x i64> %a0, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psrai_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7)
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
+ ret <8 x i64> %res2
+}
+declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone
+
+
+define <16 x i32> @test_x86_avx512_psrai_d_512(<16 x i32> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_psrai_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7)
+; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psrai_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7)
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
+ ret <16 x i32> %res2
+}
+define <16 x i32> @test_x86_avx512_maskz_psrai_d_512(<16 x i32> %a0, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psrai_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7)
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
+ ret <16 x i32> %res2
+}
+declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone
+
+
+
+define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_psrl_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]])
+; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psrl_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
+ ret <16 x i32> %res2
+}
+define <16 x i32> @test_x86_avx512_maskz_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psrl_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
+ ret <16 x i32> %res2
+}
+declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_psrl_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
+; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psrl_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
+ ret <8 x i64> %res2
+}
+define <8 x i64> @test_x86_avx512_maskz_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psrl_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
+ ret <8 x i64> %res2
+}
+declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone
+
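+; The mask/maskz variants wrap the shift in a select on %mask.cast. Its
+; shadow is built in two steps, as the checks show: the two arms' shadows
+; are selected with the concrete mask bits, and for lanes whose mask bit is
+; itself uninitialized the result is poisoned wherever the two candidate
+; values could differ ((res ^ passthru) OR'ed with both arms' shadows),
+; chosen by the mask's own shadow in the final _MSPROP_SELECT.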
+
+define <16 x i32> @test_x86_avx512_psrli_d_512(<16 x i32> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_psrli_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7)
+; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psrli_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7)
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
+ ret <16 x i32> %res2
+}
+define <16 x i32> @test_x86_avx512_maskz_psrli_d_512(<16 x i32> %a0, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psrli_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7)
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
+ ret <16 x i32> %res2
+}
+declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone
+
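+; psrli takes its count as an immediate i32, so there is no count shadow to
+; collapse: the operand's shadow is simply shifted by the same constant,
+; and the "or ... zeroinitializer" left over from the constant count's
+; (clean) shadow is a no-op.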
+
+define <8 x i64> @test_x86_avx512_psrli_q_512(<8 x i64> %a0) #0 {
+; CHECK-LABEL: @test_x86_avx512_psrli_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7)
+; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psrli_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7)
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
+ ret <8 x i64> %res2
+}
+define <8 x i64> @test_x86_avx512_maskz_psrli_q_512(<8 x i64> %a0, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psrli_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7)
+; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7)
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
+ ret <8 x i64> %res2
+}
+declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone
+
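+; psllv/psrav/psrlv below shift each lane by its own count, so the count
+; shadow is checked lane-wise instead of being collapsed: an icmp ne
+; against zeroinitializer plus a per-lane sext poisons exactly the lanes
+; whose count is uninitialized. Roughly, for each lane i:
+;   shadow(res)[i] = psllv(shadow(a0), a1)[i] | (shadow(a1)[i] != 0 ? -1 : 0)
+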
+define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_psllv_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
+; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_psllv_d_512_const() #0 {
+; CHECK-LABEL: @test_x86_avx512_psllv_d_512_const(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> zeroinitializer, <16 x i32> <i32 1, i32 0, i32 33, i32 -1, i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
+; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0, i32 4, i32 5, i32 -2, i32 0, i32 5, i32 3, i32 -3, i32 0>, <16 x i32> <i32 1, i32 0, i32 33, i32 -1, i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> zeroinitializer, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
+; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES0]], [[RES1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res0 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0, i32 4, i32 5, i32 -2, i32 0, i32 5, i32 3, i32 -3, i32 0>, <16 x i32> <i32 1, i32 0, i32 33, i32 -1, i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
+ %res1 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
+ %res2 = add <16 x i32> %res0, %res1
+ ret <16 x i32> %res2
+}
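+
+; In the _const variant every operand is a compile-time constant, so both
+; incoming shadows are zeroinitializer and the instrumentation reduces to
+; shifting a zero shadow by the constant counts; even the out-of-range
+; counts (33, -1, ...) leave the result's shadow clean.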
+
+define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psllv_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
+ ret <16 x i32> %res2
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psllv_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
+ ret <16 x i32> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_psllv_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
+; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_psllv_q_512_const() #0 {
+; CHECK-LABEL: @test_x86_avx512_psllv_q_512_const(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> zeroinitializer, <8 x i64> <i64 1, i64 0, i64 33, i64 -1, i64 2, i64 0, i64 34, i64 -2>)
+; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[RES0:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> <i64 2, i64 9, i64 0, i64 -1, i64 3, i64 7, i64 -1, i64 0>, <8 x i64> <i64 1, i64 0, i64 33, i64 -1, i64 2, i64 0, i64 34, i64 -2>)
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> zeroinitializer, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 -1>)
+; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 -1>)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[RES2:%.*]] = add <8 x i64> [[RES0]], [[RES1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res0 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> <i64 2, i64 9, i64 0, i64 -1, i64 3, i64 7, i64 -1, i64 0>, <8 x i64> <i64 1, i64 0, i64 33, i64 -1, i64 2, i64 0, i64 34, i64 -2>)
+ %res1 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 -1>)
+ %res2 = add <8 x i64> %res0, %res1
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psllv_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psllv_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
+ ret <8 x i64> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
+
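+; psrav (arithmetic right shift) follows the same per-lane recipe; since
+; the shadow is shifted with psrav itself, a poisoned sign bit smears into
+; the shifted-in positions, mirroring what the real shift does with the
+; sign bit's value.
+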
+define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_psrav_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
+; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psrav_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
+ ret <16 x i32> %res2
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psrav_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
+ ret <16 x i32> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_psrav_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
+; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psrav_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psrav_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
+ ret <8 x i64> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone
+
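+; psrlv (logical right shift) likewise shifts the shadow with the intrinsic
+; itself: shadow bits shifted out are dropped and zeros shift in, exactly
+; as the corresponding data bits would be.
+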
+define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_psrlv_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
+; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_psrlv_d_512_const() #0 {
+; CHECK-LABEL: @test_x86_avx512_psrlv_d_512_const(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> zeroinitializer, <16 x i32> <i32 1, i32 0, i32 33, i32 -1, i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
+; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0, i32 4, i32 5, i32 -2, i32 0, i32 5, i32 3, i32 -3, i32 0>, <16 x i32> <i32 1, i32 0, i32 33, i32 -1, i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> zeroinitializer, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
+; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES0]], [[RES1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res0 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0, i32 4, i32 5, i32 -2, i32 0, i32 5, i32 3, i32 -3, i32 0>, <16 x i32> <i32 1, i32 0, i32 33, i32 -1, i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
+ %res1 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
+ %res2 = add <16 x i32> %res0, %res1
+ ret <16 x i32> %res2
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psrlv_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
+ ret <16 x i32> %res2
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES2]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
+ ret <16 x i32> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_psrlv_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
+; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_psrlv_q_512_const() #0 {
+; CHECK-LABEL: @test_x86_avx512_psrlv_q_512_const(
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> zeroinitializer, <8 x i64> <i64 1, i64 0, i64 33, i64 -1, i64 2, i64 0, i64 34, i64 -2>)
+; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[RES0:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> <i64 2, i64 9, i64 0, i64 -1, i64 3, i64 7, i64 -1, i64 0>, <8 x i64> <i64 1, i64 0, i64 33, i64 -1, i64 2, i64 0, i64 34, i64 -2>)
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> zeroinitializer, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 -1>)
+; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 -1>)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[RES2:%.*]] = add <8 x i64> [[RES0]], [[RES1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res0 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> <i64 2, i64 9, i64 0, i64 -1, i64 3, i64 7, i64 -1, i64 0>, <8 x i64> <i64 1, i64 0, i64 33, i64 -1, i64 2, i64 0, i64 34, i64 -2>)
+ %res1 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 -1>)
+ %res2 = add <8 x i64> %res0, %res1
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_mask_psrlv_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
+ ret <8 x i64> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
+
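+; The cast*_freeze tests below widen a vector with a shufflevector whose
+; second operand is frozen poison. A frozen poison value is some fixed,
+; fully defined value, so its shadow is zeroinitializer and the result's
+; shadow is the same shuffle applied to the first operand's shadow with a
+; zero second operand.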
+
+define <8 x double> @test_mm256_castpd128_pd256_freeze(<2 x double> %a0) nounwind #0 {
+; CHECK-LABEL: @test_mm256_castpd128_pd256_freeze(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[A1:%.*]] = freeze <2 x double> poison
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES]]
+;
+ %a1 = freeze <2 x double> poison
+ %res = shufflevector <2 x double> %a0, <2 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ ret <8 x double> %res
+}
+
+
+define <8 x double> @test_mm256_castpd256_pd256_freeze(<4 x double> %a0) nounwind #0 {
+; CHECK-LABEL: @test_mm256_castpd256_pd256_freeze(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[A1:%.*]] = freeze <4 x double> poison
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES]]
+;
+ %a1 = freeze <4 x double> poison
+ %res = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+
+define <16 x float> @test_mm256_castps128_ps512_freeze(<4 x float> %a0) nounwind #0 {
+; CHECK-LABEL: @test_mm256_castps128_ps512_freeze(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[A1:%.*]] = freeze <4 x float> poison
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %a1 = freeze <4 x float> poison
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm256_castps256_ps512_freeze(<8 x float> %a0) nounwind #0 {
+; CHECK-LABEL: @test_mm256_castps256_ps512_freeze(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[A1:%.*]] = freeze <8 x float> poison
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> [[A1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %a1 = freeze <8 x float> poison
+  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+
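+; Integer variant: <2 x i64> -> <8 x i64> widening with a frozen-poison
+; operand, checked the same way as the floating-point casts above.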
+define <8 x i64> @test_mm512_castsi128_si512_freeze(<2 x i64> %a0) nounwind #0 {
+; CHECK-LABEL: @test_mm512_castsi128_si512_freeze(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[A1:%.*]] = freeze <2 x i64> poison
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x i64> [[A0:%.*]], <2 x i64> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES]]
+;
+ %a1 = freeze <2 x i64> poison
+ %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ ret <8 x i64> %res
+}
+
+
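+; <4 x i64> -> <8 x i64> widening; the shadow shuffle uses the identity-style
+; mask of the value shuffle with a zeroinitializer second operand.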
+define <8 x i64> @test_mm512_castsi256_si512_pd256_freeze(<4 x i64> %a0) nounwind #0 {
+; CHECK-LABEL: @test_mm512_castsi256_si512_pd256_freeze(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[A1:%.*]] = freeze <4 x i64> poison
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES]]
+;
+ %a1 = freeze <4 x i64> poison
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i64> %res
+}
+
+
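+; MSan strictly checks the shadows of the vector operands of each
+; llvm.x86.avx512.mask.cmp.pd.512 call (branching to __msan_warning_noreturn
+; if any bit is uninitialized); the mask and rounding operands are constants.
+; The shadow of the final select is then derived from the shadows of %e and %f.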
+define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) #0 {
+; CHECK-LABEL: @bad_mask_transition(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP0]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP10]] to i8
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK: 14:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 15:
+; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[C:%.*]], <8 x double> [[D:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i1> [[TMP16]] to i8
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i16
+; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP17]] to i16
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[CONV]] to <16 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[CONV2]] to <16 x i1>
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i1> [[TMP18]], <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x i1> [[TMP19]], <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP22]], <16 x i32> [[TMP4]], <16 x i32> [[TMP5]]
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x float> [[F:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x float> [[E:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP26:%.*]] = xor <16 x i32> [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = or <16 x i32> [[TMP26]], [[TMP4]]
+; CHECK-NEXT: [[TMP28:%.*]] = or <16 x i32> [[TMP27]], [[TMP5]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP28]], <16 x i32> [[TMP23]]
+; CHECK-NEXT: [[TMP29:%.*]] = select <16 x i1> [[TMP22]], <16 x float> [[F]], <16 x float> [[E]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP29]]
+;
+entry:
+ %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
+ %1 = bitcast <8 x i1> %0 to i8
+ %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
+ %3 = bitcast <8 x i1> %2 to i8
+ %conv = zext i8 %1 to i16
+ %conv2 = zext i8 %3 to i16
+ %4 = bitcast i16 %conv to <16 x i1>
+ %5 = bitcast i16 %conv2 to <16 x i1>
+ %6 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %7 = shufflevector <16 x i1> %5, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %8 = shufflevector <8 x i1> %6, <8 x i1> %7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %9 = select <16 x i1> %8, <16 x float> %f, <16 x float> %e
+ ret <16 x float> %9
+}
+
+define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) #0 {
+; CHECK-LABEL: @bad_mask_transition_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP8]] to i8
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP9]] to i16
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[CONV]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[F:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[E:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP16]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[F]], <16 x float> [[E]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP17]]
+;
+entry:
+ %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
+ %1 = bitcast <8 x i1> %0 to i8
+ %conv = zext i8 %1 to i16
+ %2 = bitcast i16 %conv to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %f, <16 x float> %e
+ ret <16 x float> %3
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double>, <8 x double>, <8 x i1>)
+declare <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float>, <16 x float>, <16 x i1>)
+declare <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64>, <8 x i64>, <8 x i1>)
+declare <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32>, <16 x i32>, <16 x i1>)
+declare <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double>, <8 x double>, <8 x i1>)
+declare <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float>, <16 x float>, <16 x i1>)
+declare <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64>, <8 x i64>, <8 x i1>)
+declare <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32>, <16 x i32>, <16 x i1>)
+
+attributes #0 = { sanitize_memory }